| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702
703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797 |
- <?php
- /**
- * Provides static methods for charset and locale safe string manipulation.
- *
- * Copyright 2003-2013 Horde LLC (http://www.horde.org/)
- *
- * See the enclosed file COPYING for license information (LGPL). If you
- * did not receive this file, see http://www.horde.org/licenses/lgpl21.
- *
- * @author Jan Schneider <jan@horde.org>
- * @category Horde
- * @license http://www.horde.org/licenses/lgpl21 LGPL 2.1
- * @package Util
- */
/**
 * Static helpers for charset- and locale-safe string manipulation.
 *
 * Multibyte work is delegated to the mbstring and iconv extensions when
 * Horde_Util::extensionExists() reports them; otherwise the methods fall back
 * to PHP's byte-oriented string functions.
 */
class Horde_String
{
    /**
     * lower() cache, keyed by the original string.
     *
     * @var array
     */
    static protected $_lowers = array();

    /**
     * upper() cache, keyed by the original string.
     *
     * @var array
     */
    static protected $_uppers = array();

    /**
     * Converts a string from one charset to another.
     *
     * Uses the iconv or the mbstring extensions.
     * The original string is returned if conversion failed or none
     * of the extensions were available.
     *
     * @param mixed $input    The data to be converted. If $input is an
     *                        array, the array's string keys and all values
     *                        get converted recursively. If it is an object,
     *                        a clone with converted accessible properties is
     *                        returned.
     * @param string $from    The string's current charset.
     * @param string $to      The charset to convert the string to.
     * @param boolean $force  Force conversion even if both charsets are
     *                        identical?
     *
     * @return mixed  The converted input data.
     */
    static public function convertCharset($input, $from, $to, $force = false)
    {
        /* Don't bother converting numbers. */
        if (is_numeric($input)) {
            return $input;
        }

        /* If the from and to character sets are identical, return now. */
        if (!$force && $from == $to) {
            return $input;
        }
        $from = self::lower($from);
        $to = self::lower($to);
        if (!$force && $from == $to) {
            return $input;
        }

        if (is_array($input)) {
            $tmp = array();
            foreach ($input as $key => $val) {
                /* Integer keys carry no charset; only string keys are
                 * converted. */
                if (is_string($key)) {
                    $key = self::_convertCharset($key, $from, $to);
                }
                $tmp[$key] = self::convertCharset($val, $from, $to, $force);
            }
            return $tmp;
        }

        if (is_object($input)) {
            // PEAR_Error/Exception objects are almost guaranteed to contain
            // recursion, which will cause a segfault in PHP. We should never
            // reach this line, but add a check. Throwable covers the Error
            // hierarchy of PHP 7+ (the class simply never matches on PHP 5).
            if (($input instanceof Exception) ||
                ($input instanceof Throwable) ||
                ($input instanceof PEAR_Error)) {
                return '';
            }

            $input = clone $input;
            foreach (get_object_vars($input) as $key => $val) {
                $input->$key = self::convertCharset($val, $from, $to, $force);
            }
            return $input;
        }

        if (!is_string($input)) {
            return $input;
        }

        return self::_convertCharset($input, $from, $to);
    }

    /**
     * Internal function used to do charset conversion.
     *
     * @param string $input  See self::convertCharset().
     * @param string $from   See self::convertCharset(); must already be
     *                       lowercase.
     * @param string $to     See self::convertCharset(); must already be
     *                       lowercase.
     *
     * @return string  The converted string, or $input if no conversion
     *                 method succeeded.
     */
    static protected function _convertCharset($input, $from, $to)
    {
        /* Use utf8_[en|de]code() if possible and if the string isn't too
         * large (less than 16 MB = 16 * 1024 * 1024 = 16777216 bytes) - these
         * functions use more memory.
         * NOTE(review): both lists contain 'utf-8', so a forced
         * utf-8 -> utf-8 conversion is run through utf8_encode(). That looks
         * unintended but is kept as is - confirm with callers. Both
         * functions are also deprecated as of PHP 8.2. */
        if (function_exists('utf8_encode') &&
            Horde_Util::extensionExists('xml') &&
            ((strlen($input) < 16777216) ||
             !Horde_Util::extensionExists('iconv') ||
             !Horde_Util::extensionExists('mbstring'))) {
            if (($to == 'utf-8') &&
                in_array($from, array('iso-8859-1', 'us-ascii', 'utf-8'))) {
                return utf8_encode($input);
            }

            if (($from == 'utf-8') &&
                in_array($to, array('iso-8859-1', 'us-ascii', 'utf-8'))) {
                return utf8_decode($input);
            }
        }

        /* Try UTF7-IMAP conversions. */
        if (($from == 'utf7-imap') || ($to == 'utf7-imap')) {
            try {
                if ($from == 'utf7-imap') {
                    return self::convertCharset(Horde_Imap_Client_Utf7imap::Utf7ImapToUtf8($input), 'UTF-8', $to);
                } else {
                    if ($from == 'utf-8') {
                        $conv = $input;
                    } else {
                        $conv = self::convertCharset($input, $from, 'UTF-8');
                    }
                    return Horde_Imap_Client_Utf7imap::Utf8ToUtf7Imap($conv);
                }
            } catch (Horde_Imap_Client_Exception $e) {
                return $input;
            }
        }

        /* Try iconv with transliteration. Any notice or warning raised by
         * iconv() counts as a failed conversion. */
        if (Horde_Util::extensionExists('iconv')) {
            $out = self::_quietCall('iconv', array($from, $to . '//TRANSLIT', $input), $failed);
            if (!$failed && ($out !== false)) {
                return $out;
            }
        }

        /* Try mbstring. */
        if (Horde_Util::extensionExists('mbstring')) {
            $out = self::_quietCall(
                'mb_convert_encoding',
                array($input, self::_mbstringCharset($to), self::_mbstringCharset($from))
            );
            if (!empty($out)) {
                return $out;
            }
        }

        return $input;
    }

    /**
     * Makes a string lowercase.
     *
     * @param string $string   The string to be converted.
     * @param boolean $locale  If true the string will be converted based on
     *                         a given charset, locale independent else.
     * @param string $charset  If $locale is true, the charset to use when
     *                         converting.
     *
     * @return string  The string with lowercase characters.
     * @throws InvalidArgumentException  If $locale is true, mbstring is
     *                                   available and $charset is null.
     */
    static public function lower($string, $locale = false, $charset = null)
    {
        if ($locale) {
            if (Horde_Util::extensionExists('mbstring')) {
                if (is_null($charset)) {
                    throw new InvalidArgumentException('$charset argument must not be null');
                }
                $ret = self::_quietCall('mb_strtolower', array($string, self::_mbstringCharset($charset)));
                if (!empty($ret)) {
                    return $ret;
                }
            }
            return strtolower($string);
        }

        if (!isset(self::$_lowers[$string])) {
            /* Switch to the "C" locale for the conversion so the result
             * does not depend on the current locale. */
            $language = setlocale(LC_CTYPE, '0');
            setlocale(LC_CTYPE, 'C');
            self::$_lowers[$string] = strtolower($string);
            setlocale(LC_CTYPE, $language);
        }

        return self::$_lowers[$string];
    }

    /**
     * Makes a string uppercase.
     *
     * @param string $string   The string to be converted.
     * @param boolean $locale  If true the string will be converted based on a
     *                         given charset, locale independent else.
     * @param string $charset  If $locale is true, the charset to use when
     *                         converting.
     *
     * @return string  The string with uppercase characters.
     * @throws InvalidArgumentException  If $locale is true, mbstring is
     *                                   available and $charset is null.
     */
    static public function upper($string, $locale = false, $charset = null)
    {
        if ($locale) {
            if (Horde_Util::extensionExists('mbstring')) {
                if (is_null($charset)) {
                    throw new InvalidArgumentException('$charset argument must not be null');
                }
                $ret = self::_quietCall('mb_strtoupper', array($string, self::_mbstringCharset($charset)));
                if (!empty($ret)) {
                    return $ret;
                }
            }
            return strtoupper($string);
        }

        if (!isset(self::$_uppers[$string])) {
            /* Switch to the "C" locale for the conversion so the result
             * does not depend on the current locale. */
            $language = setlocale(LC_CTYPE, '0');
            setlocale(LC_CTYPE, 'C');
            self::$_uppers[$string] = strtoupper($string);
            setlocale(LC_CTYPE, $language);
        }

        return self::$_uppers[$string];
    }

    /**
     * Returns a string with the first letter capitalized if it is
     * alphabetic.
     *
     * @param string $string   The string to be capitalized.
     * @param boolean $locale  If true the string will be converted based on a
     *                         given charset, locale independent else.
     * @param string $charset  The charset to use; required if $locale is
     *                         true.
     *
     * @return string  The capitalized string.
     * @throws InvalidArgumentException  If $locale is true and $charset is
     *                                   null.
     */
    static public function ucfirst($string, $locale = false, $charset = null)
    {
        if ($locale) {
            if (is_null($charset)) {
                throw new InvalidArgumentException('$charset argument must not be null');
            }
            $first = self::substr($string, 0, 1, $charset);
            if (self::isAlpha($first, $charset)) {
                $string = self::upper($first, true, $charset) . self::substr($string, 1, null, $charset);
            }
        } else {
            $string = self::upper(substr($string, 0, 1), false) . substr($string, 1);
        }

        return $string;
    }

    /**
     * Returns a string with the first letter of each word capitalized if it is
     * alphabetic.
     *
     * Sentences are split into words at whitespace.
     *
     * @param string $string   The string to be capitalized.
     * @param boolean $locale  If true the string will be converted based on a
     *                         given charset, locale independent else.
     * @param string $charset  The charset to use; required if $locale is
     *                         true.
     *
     * @return string  The capitalized string.
     */
    static public function ucwords($string, $locale = false, $charset = null)
    {
        /* The captured delimiters end up at the odd indexes, the words at
         * the even ones. */
        $words = preg_split('/(\s+)/', $string, -1, PREG_SPLIT_DELIM_CAPTURE);
        for ($i = 0, $c = count($words); $i < $c; $i += 2) {
            $words[$i] = self::ucfirst($words[$i], $locale, $charset);
        }
        return implode('', $words);
    }

    /**
     * Returns part of a string.
     *
     * @param string $string   The string to be converted.
     * @param integer $start   The part's start position, zero based.
     * @param integer $length  The part's length. If null, everything up to
     *                         the end of the string is returned.
     * @param string $charset  The charset to use when calculating the part's
     *                         position and length.
     *
     * @return string  The string's part.
     */
    static public function substr($string, $start, $length = null,
                                  $charset = 'UTF-8')
    {
        if (is_null($length)) {
            $length = self::length($string, $charset) - $start;
        }

        if ($length == 0) {
            return '';
        }

        /* Try mbstring. */
        if (Horde_Util::extensionExists('mbstring')) {
            $ret = self::_quietCall('mb_substr', array($string, $start, $length, self::_mbstringCharset($charset)));
            /* An empty result is treated as failure, so the next method
             * gets a chance. */
            if (is_string($ret) && ($ret !== '')) {
                return $ret;
            }
        }

        /* Try iconv. */
        if (Horde_Util::extensionExists('iconv')) {
            $ret = self::_quietCall('iconv_substr', array($string, $start, $length, $charset));
            /* iconv_substr() returns false on failure. */
            if ($ret !== false) {
                return $ret;
            }
        }

        return substr($string, $start, $length);
    }

    /**
     * Returns the character (not byte) length of a string.
     *
     * @param string $string   The string to return the length of.
     * @param string $charset  The charset to use when calculating the string's
     *                         length.
     *
     * @return integer  The string's length.
     */
    static public function length($string, $charset = 'UTF-8')
    {
        $charset = self::lower($charset);
        $utf8 = ($charset == 'utf-8' || $charset == 'utf8');

        /* utf8_decode() maps every character to exactly one byte. It is
         * deprecated as of PHP 8.2, so it is only used on older versions. */
        if ($utf8 && (PHP_VERSION_ID < 80200) && function_exists('utf8_decode')) {
            return strlen(utf8_decode($string));
        }

        if (Horde_Util::extensionExists('mbstring')) {
            $ret = self::_quietCall('mb_strlen', array($string, self::_mbstringCharset($charset)));
            if (!empty($ret)) {
                return $ret;
            }
        }

        if ($utf8) {
            /* Last resort: every byte that is not a continuation byte
             * (10xxxxxx) starts a character. */
            return strlen($string) - (int)preg_match_all('/[\x80-\xBF]/', $string, $continuation);
        }

        return strlen($string);
    }

    /**
     * Returns the numeric position of the first occurrence of $needle
     * in the $haystack string.
     *
     * @param string $haystack  The string to search through.
     * @param string $needle    The string to search for.
     * @param integer $offset   Allows to specify which character in haystack
     *                          to start searching.
     * @param string $charset   The charset to use when searching for the
     *                          $needle string.
     *
     * @return integer|boolean  The position of the first occurrence, or
     *                          false if $needle was not found.
     */
    static public function pos($haystack, $needle, $offset = 0,
                               $charset = 'UTF-8')
    {
        if (Horde_Util::extensionExists('mbstring')) {
            $ret = self::_quietCall(
                'mb_strpos',
                array($haystack, $needle, $offset, self::_mbstringCharset($charset)),
                $failed
            );
            if (!$failed) {
                return $ret;
            }
        }

        return strpos($haystack, $needle, $offset);
    }

    /**
     * Returns the numeric position of the last occurrence of $needle
     * in the $haystack string.
     *
     * @param string $haystack  The string to search through.
     * @param string $needle    The string to search for.
     * @param integer $offset   Allows to specify which character in haystack
     *                          to start searching.
     * @param string $charset   The charset to use when searching for the
     *                          $needle string.
     *
     * @return integer|boolean  The position of the last occurrence, or
     *                          false if $needle was not found.
     */
    static public function rpos($haystack, $needle, $offset = 0,
                                $charset = 'UTF-8')
    {
        if (Horde_Util::extensionExists('mbstring')) {
            $ret = self::_quietCall(
                'mb_strrpos',
                array($haystack, $needle, $offset, self::_mbstringCharset($charset)),
                $failed
            );
            if (!$failed) {
                return $ret;
            }
        }

        return strrpos($haystack, $needle, $offset);
    }

    /**
     * Returns a string padded to a certain length with another string.
     * This method behaves exactly like str_pad() but is multibyte safe.
     *
     * @param string $input    The string to be padded.
     * @param integer $length  The length of the resulting string.
     * @param string $pad      The string to pad the input string with. Must
     *                         be in the same charset like the input string.
     * @param const $type      The padding type. One of STR_PAD_LEFT,
     *                         STR_PAD_RIGHT, or STR_PAD_BOTH.
     * @param string $charset  The charset of the input and the padding
     *                         strings.
     *
     * @return string  The padded string.
     */
    static public function pad($input, $length, $pad = ' ',
                               $type = STR_PAD_RIGHT, $charset = 'UTF-8')
    {
        $mb_length = self::length($input, $charset);
        $sb_length = strlen($input);
        $pad_length = self::length($pad, $charset);

        /* Return if we already have the length. */
        if ($mb_length >= $length) {
            return $input;
        }

        /* Shortcut for single byte strings. An empty padding string is
         * handed to str_pad() as well, so that its native error is raised
         * instead of a division by zero below. */
        if (($pad_length == 0) ||
            ($mb_length == $sb_length && $pad_length == strlen($pad))) {
            return str_pad($input, $length, $pad, $type);
        }

        $missing = $length - $mb_length;

        switch ($type) {
        case STR_PAD_LEFT:
            return self::_padding($pad, $pad_length, $missing, $charset) . $input;

        case STR_PAD_BOTH:
            /* Like str_pad(), the right side gets the extra character if
             * the padding cannot be divided evenly. */
            return self::_padding($pad, $pad_length, (int)floor($missing / 2), $charset) .
                $input .
                self::_padding($pad, $pad_length, (int)ceil($missing / 2), $charset);

        case STR_PAD_RIGHT:
            return $input . self::_padding($pad, $pad_length, $missing, $charset);

        default:
            /* Unknown padding type: let str_pad() report it. */
            return str_pad($input, $length, $pad, $type);
        }
    }

    /**
     * Builds a padding of exactly $count characters by repeating $pad.
     *
     * @param string $pad          The padding string.
     * @param integer $pad_length  The character length of $pad; must be
     *                             greater than zero.
     * @param integer $count       The number of characters required.
     * @param string $charset      The charset of $pad.
     *
     * @return string  The padding.
     */
    static protected function _padding($pad, $pad_length, $count, $charset)
    {
        return self::substr(
            str_repeat($pad, (int)ceil($count / $pad_length)),
            0,
            $count,
            $charset
        );
    }

    /**
     * Wraps the text of a message.
     *
     * The text is always treated as UTF-8.
     *
     * @param string $string         String containing the text to wrap.
     * @param integer $width         Wrap the string at this number of
     *                               characters.
     * @param string $break          Character(s) to use when breaking lines.
     * @param boolean $cut           Whether to cut inside words if a line
     *                               can't be wrapped.
     * @param boolean $line_folding  Whether to apply line folding rules per
     *                               RFC 822 or similar. The correct break
     *                               characters including leading whitespace
     *                               have to be specified too.
     *
     * @return string  String containing the wrapped text.
     */
    static public function wordwrap($string, $width = 75, $break = "\n",
                                    $cut = false, $line_folding = false)
    {
        $wrapped = '';

        while (self::length($string, 'UTF-8') > $width) {
            /* Split off the next $width characters as candidate line. */
            $line = self::substr($string, 0, $width, 'UTF-8');
            $string = self::substr($string, self::length($line, 'UTF-8'), null, 'UTF-8');

            // Make sure we didn't cut a word, unless we want hard breaks
            // anyway.
            if (!$cut && preg_match('/^(.+?)((\s|\r?\n).*)/us', $string, $match)) {
                $line .= $match[1];
                $string = $match[2];
            }

            // Wrap at existing line breaks.
            if (preg_match('/^(.*?)(\r?\n)(.*)$/su', $line, $match)) {
                $wrapped .= $match[1] . $match[2];
                $string = $match[3] . $string;
                continue;
            }

            // Wrap at a colon or semicolon followed by a whitespace if doing
            // line folding.
            if ($line_folding &&
                preg_match('/^(.*?)(;|:)(\s+.*)$/u', $line, $match)) {
                $wrapped .= $match[1] . $match[2] . $break;
                $string = $match[3] . $string;
                continue;
            }

            // Wrap at the last whitespace of $line. With line folding the
            // whitespace is kept at the start of the next line.
            $sub = $line_folding
                ? '(.+[^\s])'
                : '(.*)';
            if (preg_match('/^' . $sub . '(\s+)(.*)$/u', $line, $match)) {
                $wrapped .= $match[1] . $break;
                $string = ($line_folding ? $match[2] : '') . $match[3] . $string;
                continue;
            }

            // Hard wrap if necessary.
            if ($cut) {
                $wrapped .= $line . $break;
                continue;
            }

            // No break opportunity found: keep the overlong line as is.
            $wrapped .= $line;
        }

        return $wrapped . $string;
    }

    /**
     * Wraps the text of a message, line by line.
     *
     * @param string $text        String containing the text to wrap.
     * @param integer $length     Wrap $text at this number of characters.
     * @param string $break_char  Character(s) to use when breaking lines.
     * @param boolean $quote      Ignore lines that are wrapped with the '>'
     *                            character (RFC 2646)? If true, we don't
     *                            remove any padding whitespace at the end of
     *                            those lines.
     *
     * @return string  String containing the wrapped text.
     */
    static public function wrap($text, $length = 80, $break_char = "\n",
                                $quote = false)
    {
        $paragraphs = array();

        foreach (preg_split('/\r?\n/', $text) as $input) {
            if ($quote && (strpos($input, '>') === 0)) {
                $line = $input;
            } else {
                /* We need to handle the Usenet-style signature line
                 * separately; since the space after the two dashes is
                 * REQUIRED, we don't want to trim the line. */
                if ($input != '-- ') {
                    $input = rtrim($input);
                }
                $line = self::wordwrap($input, $length, $break_char);
            }

            $paragraphs[] = $line;
        }

        return implode($break_char, $paragraphs);
    }

    /**
     * Return a truncated string, suitable for notifications.
     *
     * @param string $text     The original string (UTF-8).
     * @param integer $length  The maximum length, including the trailing
     *                         '...'.
     *
     * @return string  The truncated string, if longer than $length.
     */
    static public function truncate($text, $length = 100)
    {
        return (self::length($text) > $length)
            ? rtrim(self::substr($text, 0, $length - 3)) . '...'
            : $text;
    }

    /**
     * Return an abbreviated string, with characters in the middle of the
     * excessively long string replaced by '...'.
     *
     * @param string $text     The original string (UTF-8).
     * @param integer $length  The length at which to abbreviate.
     *
     * @return string  The abbreviated string, if longer than $length.
     */
    static public function abbreviate($text, $length = 20)
    {
        if (self::length($text) <= $length) {
            return $text;
        }

        /* Number of characters kept on each side of the ellipsis; the head
         * gets the extra character if the remainder is odd. */
        $keep = ($length - 3) / 2;
        $head = (int)round($keep);
        $tail = (int)$keep;

        /* A tail of zero characters must not be passed on as offset -0,
         * which would return the complete text. */
        return rtrim(self::substr($text, 0, $head)) .
            '...' .
            (($tail > 0) ? ltrim(self::substr($text, -$tail)) : '');
    }

    /**
     * Returns the common leading part of two strings.
     *
     * The comparison is done byte by byte, not character by character.
     *
     * @param string $str1  A string.
     * @param string $str2  Another string.
     *
     * @return string  The start of $str1 and $str2 that is identical in both.
     */
    static public function common($str1, $str2)
    {
        for ($result = '', $i = 0;
             isset($str1[$i]) && isset($str2[$i]) && $str1[$i] === $str2[$i];
             $i++) {
            $result .= $str1[$i];
        }
        return $result;
    }

    /**
     * Returns true if every character in the parameter is an alphabetic
     * character.
     *
     * @param string $string   The string to test.
     * @param string $charset  The charset to use when testing the string.
     *
     * @return boolean  True if the parameter was alphabetic only.
     */
    static public function isAlpha($string, $charset)
    {
        if (!Horde_Util::extensionExists('mbstring') ||
            !function_exists('mb_ereg_match')) {
            return ctype_alpha($string);
        }

        $charset = self::_mbstringCharset($charset);
        $old_charset = mb_regex_encoding();
        $switch = ($charset != $old_charset);

        if ($switch) {
            self::_quietCall('mb_regex_encoding', array($charset));
        }

        /* mb_ereg_match() only matches at the start of the subject, so the
         * pattern has to span the whole string to test every character. */
        $alpha = (self::_quietCall('mb_ereg_match', array('\A[[:alpha:]]*\z', $string)) === true);

        if ($switch) {
            self::_quietCall('mb_regex_encoding', array($old_charset));
        }

        return $alpha;
    }

    /**
     * Returns true if every character in the parameter is a lowercase letter
     * in the given charset.
     *
     * @param string $string   The string to test.
     * @param string $charset  The charset to use when testing the string.
     *
     * @return boolean  True if the parameter was lowercase.
     */
    static public function isLower($string, $charset)
    {
        return ((self::lower($string, true, $charset) === $string) &&
                self::isAlpha($string, $charset));
    }

    /**
     * Returns true if every character in the parameter is an uppercase letter
     * in the given charset.
     *
     * @param string $string   The string to test.
     * @param string $charset  The charset to use when testing the string.
     *
     * @return boolean  True if the parameter was uppercase.
     */
    static public function isUpper($string, $charset)
    {
        return ((self::upper($string, true, $charset) === $string) &&
                self::isAlpha($string, $charset));
    }

    /**
     * Performs a multibyte safe regex match search on the text provided.
     *
     * @param string $text     The text to search.
     * @param array $regex     The regular expressions to use, without perl
     *                         regex delimiters. They are wrapped in '/', so
     *                         any literal '/' has to be escaped.
     * @param string $charset  The character set of the text.
     *
     * @return array  The matches array from the first regex that matches.
     */
    static public function regexMatch($text, $regex, $charset = null)
    {
        /* The patterns are run with the /u modifier, so both patterns and
         * text have to be UTF-8. */
        if (!empty($charset)) {
            $regex = self::convertCharset($regex, $charset, 'utf-8');
            $text = self::convertCharset($text, $charset, 'utf-8');
        }

        $matches = array();
        foreach ($regex as $val) {
            if (preg_match('/' . $val . '/u', $text, $matches)) {
                break;
            }
        }

        if (!empty($charset)) {
            $matches = self::convertCharset($matches, 'utf-8', $charset);
        }

        return $matches;
    }

    /**
     * Check to see if a string is valid UTF-8.
     *
     * @param string $text  The text to check.
     *
     * @return boolean  True if valid UTF-8.
     */
    static public function validUtf8($text)
    {
        /* With the /u modifier PCRE validates the whole subject before
         * matching and fails on any malformed sequence (stray continuation
         * bytes, truncated or overlong forms, surrogates, 5 & 6-byte
         * characters), so the empty pattern only matches valid UTF-8. */
        return preg_match('//u', strval($text)) === 1;
    }

    /**
     * Workaround charsets that don't work with mbstring functions.
     *
     * @param string $charset  The original charset.
     *
     * @return string  The charset to use with mbstring functions.
     */
    static protected function _mbstringCharset($charset)
    {
        /* mbstring functions do not handle the 'ks_c_5601-1987' &
         * 'ks_c_5601-1989' charsets. However, these charsets are used, for
         * example, by various versions of Outlook to send Korean characters.
         * Use UHC (CP949) encoding instead. See, e.g.,
         * http://lists.w3.org/Archives/Public/ietf-charsets/2001AprJun/0030.html */
        return in_array(self::lower($charset), array('ks_c_5601-1987', 'ks_c_5601-1989'))
            ? 'UHC'
            : $charset;
    }

    /**
     * Strip UTF-8 byte order mark (BOM) from string data.
     *
     * @param string $str  Input string (UTF-8).
     *
     * @return string  Stripped string (UTF-8).
     */
    static public function trimUtf8Bom($str)
    {
        return (substr($str, 0, 3) === "\xEF\xBB\xBF")
            ? substr($str, 3)
            : $str;
    }

    /**
     * Calls a function with all PHP errors silenced and reports whether any
     * error was raised meanwhile.
     *
     * This replaces the track_errors/$php_errormsg mechanism, which is not
     * available anymore as of PHP 8.
     *
     * @param callable $callback  The function to call.
     * @param array $args         The positional arguments for the function.
     * @param boolean $failed     Set to true if the call raised any PHP
     *                            error (of any level) or threw an Error
     *                            (e.g. a ValueError for an unknown
     *                            encoding on PHP 8), false otherwise.
     *
     * @return mixed  The function's return value, or false if it threw an
     *                Error.
     */
    static protected function _quietCall($callback, array $args, &$failed = false)
    {
        $failed = false;
        set_error_handler(function () use (&$failed) {
            $failed = true;
            return true;
        });

        try {
            $result = call_user_func_array($callback, $args);
        } catch (Exception $e) {
            restore_error_handler();
            throw $e;
        } catch (Error $e) {
            /* Never matches on PHP 5, where the class doesn't exist. */
            restore_error_handler();
            $failed = true;
            return false;
        }

        restore_error_handler();
        return $result;
    }
}
|