text = $text;
    }

    /**
     * Heuristically detects "scream font", i.e. text written mostly in upper case.
     *
     * Counts the upper-case letters (A-Z, ÄÖÜ) and the lower-case letters
     * (a-z, äöüß). The text counts as screaming when it contains at least one
     * upper-case letter and at least as many upper-case as lower-case letters.
     *
     * @param string $str String to check; the object's text is used when null
     * @param float $tolerance Intended ratio (in %: 0 ... 1); not evaluated yet (see TODO)
     * @return bool
     * 2011-10-13 ms
     */
    public function isScreamFont($str = null, $tolerance = 0.4) {
        if ($str === null) {
            $str = $this->text;
        }
        if (empty($str)) {
            return false;
        }

        // preg_match_all() returns the number of full matches, or false on
        // failure (e.g. invalid UTF-8 together with the `u` modifier). The cast
        // maps a failure to 0 instead of calling count() on a non-array.
        $countUpper = (int)preg_match_all('/[A-ZÄÖÜ]/u', $str, $uppercase);
        $countLower = (int)preg_match_all('/[a-zäöüß]/u', $str, $lowercase);

        //TODO: tolerance
        return $countUpper > 0 && $countUpper >= $countLower;
    }

    /**
     * Checks whether a string consists solely of word characters (`\w`),
     * optionally extended by a set of additional allowed characters.
     *
     * @param string $str String to check; the object's text is used when null
     * @param string $additionalChars Extra characters that are accepted as well
     * - e.g. `-()0123456789`
     * @return int|bool Result of preg_match(): 1 on match, 0 on no match, false on error
     */
    public function isWord($str = null, $additionalChars = null) {
        if ($str === null) {
            $str = $this->text;
        }

        $allowed = '\w';
        if ($additionalChars !== null && $additionalChars !== '') {
            // preg_quote() escapes `\`, `^`, `-` and `]`, which makes the result
            // safe to embed inside a character class.
            $allowed .= preg_quote($additionalChars, '/');
        }

        return preg_match('/^[' . $allowed . ']+$/', (string)$str);
    }

    /* utf8 general stuff */

    /**
     * Tests whether a string contains only 7-bit ASCII bytes.
     *
     *     $ascii = $this->isAscii($str);
     *
     * @param string $str String to check; the object's text is used when null
     * @return bool True when no byte outside \x00-\x7F is present
     */
    public function isAscii($str = null) {
        if ($str === null) {
            $str = $this->text;
        }
        return !preg_match('/[^\x00-\x7F]/S', $str);
    }

    /**
     * Strips out device control codes in the ASCII range.
     *
     * Tab (\x09), line feed (\x0A) and carriage return (\x0D) are not part of
     * the removed ranges and are therefore kept.
     *
     *     $str = $this->stripAsciiCtrl($str);
     *
     * @param string $str String to clean; the object's text is used when null
     * @return string
     */
    public function stripAsciiCtrl($str = null) {
        if ($str === null) {
            $str = $this->text;
        }
        return preg_replace('/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]+/S', '', $str);
    }

    /**
     * Strips out all non-7bit ASCII bytes.
     *
     *     $str = $this->stripNonAscii($str);
     *
     * @param string $str String to clean; the object's text is used when null
     * @return string
     */
    public function stripNonAscii($str = null) {
        if ($str === null) {
            $str = $this->text;
        }
        return preg_replace('/[^\x00-\x7F]+/S', '', $str);
    }

    /**
     * Converts a string into the list of its byte values (via ord()),
     * joined by a separator.
     *
     * Works byte-wise: a multibyte character yields one number per byte.
     *
     * @param string $str String to convert; the object's text is used when null
     * @param string $separator Glue between the numbers
     * @return string E.g. "97-98" for "ab"
     */
    public function convertToOrd($str = null, $separator = '-') {
        if ($str === null) {
            $str = $this->text;
        }

        // Without PREG_SPLIT_NO_EMPTY the empty pattern also returns an empty
        // string before the first and after the last byte, and ord('') would
        // add a bogus 0 at both ends of the result.
        $chars = preg_split('//', $str, -1, PREG_SPLIT_NO_EMPTY);

        $res = array();
        foreach ($chars as $char) {
            $res[] = ord($char);
        }
        return implode($separator, $res);
    }

    /**
     * Builds a textual table of the characters of a string and their ord() values.
     *
     * NOTE(review): the string literals in this method look like they lost
     * their markup (probably table/row/cell tags) somewhere on the way into
     * this file. As written, rows flushed inside the loop append only empty
     * strings, and only the last collected row is emitted. The literals are
     * kept as they are because the intended markup cannot be derived from the
     * code - restore them from the original source.
     *
     * @param string $str String to convert
     * @param int $maxCols Row length after which the collected row is flushed; falsy disables
     * @return string
     */
    public static function convertToOrdTable($str, $maxCols = 20) {
        $res = '';
        $r = array('chr' => array(), 'ord' => array());
        $chars = preg_split('//', $str, -1);
        $count = 0;
        foreach ($chars as $key => $char) {
            if ($maxCols && $maxCols < $count || $key === count($chars) - 1) {
                $res .= '';
                $res .= '';
                $res .= '';
                $res .= '';
                $count = 0;
                $r = array('chr' => array(), 'ord' => array());
            }
            $count++;
            $r['ord'][] = ord($char);
            $r['chr'][] = $char;
        }

        $res .= "\n" . implode('', $r['chr']) . "\n" . implode('', $r['ord']) . "\n";
        return $res;
    }

    /* other */

    /**
     * Explode a string of given tags into an array.
     *
     * Duplicates and empty tags are dropped; quoting used to protect commas
     * and quotes inside a tag is removed again.
     *
     * @param string $tags Comma separated tag list as typed by the user
     * @return array List of cleaned tags
     */
    public function explodeTags($tags) {
        // This regexp allows the following types of user input:
        // this, "somecompany, llc", "and ""this"" w,o.rks", foo bar
        $regexp = '%(?:^|,\ *)("(?>[^"]*)(?>""[^"]* )*"|(?: [^",]*))%x';
        preg_match_all($regexp, $tags, $matches);
        $typedTags = array_unique($matches[1]);

        $result = array();
        foreach ($typedTags as $tag) {
            // If a user has escaped a term (to demonstrate that it is a group,
            // or includes a comma or quote character), we remove the escape
            // formatting so to save the term into the database as the user intends.
            $unquoted = preg_replace('/^"(.*)"$/', '\1', $tag);
            $tag = trim(str_replace('""', '"', $unquoted));
            if ($tag != "") {
                $result[] = $tag;
            }
        }

        return $result;
    }

    /**
     * Implode an array of tags into a string.
     *
     * Counterpart of explodeTags(): tags containing a comma or a double quote
     * are wrapped in double quotes, with inner quotes doubled.
     *
     * @param array $tags List of tags
     * @return string Tags joined by ", "
     */
    public function implodeTags($tags) {
        $encodedTags = array();
        foreach ($tags as $tag) {
            // Commas and quotes in tag names are special cases, so encode them.
            if (strpos($tag, ',') !== false || strpos($tag, '"') !== false) {
                $tag = '"' . str_replace('"', '""', $tag) . '"';
            }
            $encodedTags[] = $tag;
        }
        return implode(', ', $encodedTags);
    }

    /**
     * Prevents [widow words](http://www.shauninman.com/archive/2006/08/22/widont_wordpress_plugin)
     * by inserting a non-breaking space between the last two words.
     *
     *     echo $this->widont($text);
     *
     * @param string $str Text to remove widows from; the object's text is used when null
     * @return string
     */
    public function widont($str = null) {
        if ($str === null) {
            $str = $this->text;
        }

        $str = rtrim($str);
        $space = strrpos($str, ' ');

        if ($space !== false) {
            // The last regular space is replaced by the HTML non-breaking space
            // entity, so the final two words cannot be separated by a line wrap.
            $str = substr($str, 0, $space) . '&nbsp;' . substr($str, $space + 1);
        }

        return $str;
    }

    /* text object specific */

    /**
     * Counts occurrences in the object's text (byte-wise).
     *
     * Without $char every byte of the text is counted. With $char the number
     * of (possibly overlapping) occurrences of that string is returned.
     *
     * @param string $char String to count, or null to count everything
     * @param bool $caseSensitive When false, text and $char are lower-cased first
     * @return array|int array(char => amount) for empty char or int amount for specific char
     * 2010-08-31 ms
     */
    public function occurrences($char = null, $caseSensitive = false) {
        if ($caseSensitive) {
            $str = $this->text;
        } else {
            if ($char !== null) {
                $char = strtolower($char);
            }
            $str = strtolower($this->text);
        }

        if ($char === null) {
            return array_count_values(str_split($str));
        }

        // An empty needle cannot be searched for; it never "occurs".
        if ($char === '') {
            return 0;
        }

        $occ = 0;
        $pos = 0;
        // Advancing by one position (instead of the needle length) also
        // counts overlapping occurrences.
        while (($pos = strpos($str, $char, $pos)) !== false) {
            $occ++;
            $pos++;
        }
        return $occ;
    }

    /**
     * Returns the most frequent character(s) of the object's text.
     *
     * @param bool $caseSensitive Passed on to occurrences()
     * @return array array(char => amount) holding every character that reaches the maximum
     * 2010-08-31 ms
     */
    public function maxOccurrences($caseSensitive = false) {
        $counts = $this->occurrences(null, $caseSensitive);

        $max = 0;
        $occ = array();
        foreach ($counts as $key => $value) {
            if ($value === $max) {
                $occ[$key] = $value;
            } elseif ($value > $max) {
                $max = $value;
                $occ = array($key => $value);
            }
        }
        return $occ;
    }

    /**
     * Returns the length of the text in characters (mb_strlen), cached after
     * the first call.
     *
     * The cache property is spelled `lenght`; it is not renamed here because
     * it is declared outside of this section.
     *
     * @return int
     */
    public function getLength() {
        if (!$this->lenght) {
            $this->lenght = mb_strlen($this->text);
        }
        return $this->lenght;
    }

    /**
     * Returns the number of characters without line breaks (\n and \r), cached.
     *
     * @return int
     */
    public function getCharacter() {
        if (!$this->char) {
            $this->char = mb_strlen(strtr($this->text, array("\n" => '', "\r" => '')));
        }
        return $this->char;
    }

    /**
     * Returns the number of letters (a-z, ä, ö, ü - case-insensitive), cached.
     *
     * @return int
     */
    public function getLetter() {
        if (!$this->letter) {
            // Counting via a UTF-8 aware character class avoids byte indexing
            // into a multibyte string, does not depend on getLength() having
            // been called before, and also counts the letter "a".
            $lower = mb_strtolower($this->text);
            $this->letter = (int)preg_match_all('/[a-zäöü]/u', $lower, $letters);
        }
        return $this->letter;
    }

    /**
     * Returns the number of spaces and tabs in the text, cached.
     *
     * @return int
     */
    public function getSpace() {
        if (!$this->space) {
            $this->space = mb_substr_count($this->text, " ") + mb_substr_count($this->text, "\t");
        }
        return $this->space;
    }

    /**
     * Returns the number of characters that are neither letter nor space/tab.
     *
     * @return int
     */
    public function getSymbol() {
        return $this->getCharacter() - $this->getLetter() - $this->getSpace();
    }

    //TODO: improve it to work with case insensitivity and utf8 chars like é or î
/**
     * Returns the words of the text, or their number. The result is cached.
     *
     * A word is a run of the characters matched by the pattern below
     * (letters, umlauts, hyphen, apostrophe, backslash and double quote).
     *
     * @param bool $parse True to get the list of words, false for the word count
     * @return array|int
     */
    public function getWord($parse = false) {
        if (!$this->word && !$this->r_word) {
            @preg_match_all("/[A-Za-zäöüÄÖÜß\-'\\\"]+/", $this->text, $m);
            $this->word = count($m[0]);
            $this->r_word = $m[0];
        }
        return $parse ? $this->r_word : $this->word;
    }

    /**
     * Returns the unique words of the text with punctuation and digits stripped.
     *
     * The NL and TB constants are defined outside of this section; they are
     * presumably newline and tab.
     *
     * @param array $options
     * - min_char: drop words shorter than this many characters
     * - max_char: drop words longer than this many characters
     * - case_sensitive: keep the case instead of lower-casing every word
     * @return array Unique words; keys are the positions of the original pieces
     * 2010-10-09 ms
     */
    public function words($options = array()) {
        $text = str_replace(array(PHP_EOL, NL, TB), ' ', $this->text);
        $pieces = array_unique(explode(' ', $text));

        // Characters removed from every piece - built once, not per iteration.
        $search = array(',', '.', ';', ':', '#', '', '(', ')', '{', '}', '[', ']', '$', '%', '"', '!', '?', '<', '>', '=', '/');
        $search = array_merge($search, array(1, 2, 3, 4, 5, 6, 7, 8, 9, 0));

        foreach ($pieces as $key => $piece) {
            if (empty($options['case_sensitive'])) {
                $piece = mb_strtolower($piece);
            }
            $piece = trim(str_replace($search, '', $piece));

            $tooShort = !empty($options['min_char']) && mb_strlen($piece) < $options['min_char'];
            $tooLong = !empty($options['max_char']) && mb_strlen($piece) > $options['max_char'];
            if (empty($piece) || $tooShort || $tooLong) {
                unset($pieces[$key]);
            } else {
                $pieces[$key] = $piece;
            }
        }

        // Stripping characters can create new duplicates ("word" and "word,").
        return array_unique($pieces);
    }

    /**
     * Counts how often each word occurs in the text.
     *
     * The NL, CR and TB constants are defined outside of this section; they
     * are presumably newline, carriage return and tab.
     *
     * @param array $options
     * - min_char: ignore words shorter than this many characters
     * - max_char: ignore words longer than this many characters
     * - case_sensitive: keep the case instead of lower-casing every word
     * - sort: 'asc'/'desc' (by count), 'alpha' (by word), 'length' (not implemented yet), false
     * - limit: return only the first n entries
     * @return array array(word => count)
     * 2010-10-09 ms
     */
    public function wordCount($options = array()) {
        $text = str_replace(array(NL, CR, PHP_EOL, TB), ' ', $this->text);

        // Punctuation and digits are turned into separators.
        $search = array('*', '+', '~', ',', '.', ';', ':', '#', '', '(', ')', '{', '}', '[', ']', '$', '%', '“', '”', '—', '"', '‘', '’', '!', '?', '<', '>', '=', '/');
        $search = array_merge($search, array(1, 2, 3, 4, 5, 6, 7, 8, 9, 0));
        $text = str_replace($search, ' ', $text);

        $res = array();
        //TODO: use array_count_values()?
        foreach (explode(' ', $text) as $piece) {
            if (empty($options['case_sensitive'])) {
                $piece = mb_strtolower($piece);
            }
            $piece = trim($piece);

            $tooShort = !empty($options['min_char']) && mb_strlen($piece) < $options['min_char'];
            $tooLong = !empty($options['max_char']) && mb_strlen($piece) > $options['max_char'];
            if (empty($piece) || $tooShort || $tooLong) {
                continue;
            }

            if (!array_key_exists($piece, $res)) {
                $res[$piece] = 0;
            }
            $res[$piece]++;
        }

        if (!empty($options['sort'])) {
            $sort = strtolower($options['sort']);
            if ($sort == 'asc') {
                asort($res);
            } elseif ($sort == 'desc') {
                arsort($res);
            } elseif ($sort == 'length') {
                //TODO:
                //uasort($res, $callback);
            } elseif ($sort == 'alpha') {
                ksort($res);
            }
        }

        if (!empty($options['limit'])) {
            $res = array_slice($res, 0, (int)$options['limit'], true);
        }

        return $res;
    }

    /**
     * Returns the sentences of the text, or their number. The result is cached.
     *
     * A sentence is a run of characters ended by one or more of `:`, `;`,
     * `!`, `.` or a space; line breaks inside a sentence are removed.
     *
     * @param bool $parse True to get the list of sentences, false for the count
     * @return array|int
     */
    public function getSentence($parse = false) {
        if (!$this->sen && !$this->r_sen) {
            @preg_match_all("/[^:|;|\!|\.]+(:|;|\!|\.| )+/", $this->text, $m);
            $this->sen = count($m[0]);

            // Collect into a local list so the property is an array even
            // when nothing matched.
            $sentences = array();
            foreach ($m[0] as $s) {
                $sentences[] = strtr(trim($s), array("\n" => '', "\r" => ''));
            }
            $this->r_sen = $sentences;
        }
        return $parse ? $this->r_sen : $this->sen;
    }

    /**
     * Returns the paragraphs of the text, or their number. The result is cached.
     *
     * Carriage returns are removed and a trailing line feed is appended before
     * matching, so the last paragraph is found as well.
     *
     * @param bool $parse True to get the list of paragraphs, false for the count
     * @return array|int
     */
    public function getParagraph($parse = false) {
        if (!$this->para && !$this->r_para) {
            @preg_match_all("/[^\n]+?(:|;|\!|\.| )+\n/s", strtr($this->text, array("\r" => '')) . "\n", $m);
            $this->para = count($m[0]);

            // Collect into a local list so the property is an array even
            // when nothing matched.
            $paragraphs = array();
            foreach ($m[0] as $p) {
                $paragraphs[] = trim($p);
            }
            $this->r_para = $paragraphs;
        }
        return $parse ? $this->r_para : $this->para;
    }

    /**
     * Normalizes the spacing around punctuation. The result is cached.
     *
     * NOTE(review): runs of whitespace inside these patterns may have been
     * collapsed before the code reached this file (e.g. `/ {1,}/` replacing a
     * single space by a single space) - verify against the original source.
     *
     * @param int|bool $wordwrap Line width passed to wordwrap(), or false for no wrapping
     * @return string
     */
    public function beautify($wordwrap = false) {
        if (!$this->beautified) {
            $patterns = array("/ {1,}/", "/\. {1,}\./", "/\. *(?!\.)/", "/(,|:|;|\!|\)) */", "/(,|:|;|\!|\)|\.) *\r\n/", "/(\r\n) {3,}/");
            $replacements = array(" ", ".", ". ", "$1 ", "$1\r\n", "\r\n\r\n");
            $this->beautified = @preg_replace($patterns, $replacements, $this->text);
        }
        return $wordwrap ? wordwrap($this->beautified, $wordwrap) : $this->beautified;
    }

    /**
     * High ASCII to Entities
     *
     * Converts High ascii text and MS Word special characters to character
     * entities. The input is read byte-wise as UTF-8: 2-, 3- and 4-byte
     * sequences become one numeric entity each.
     *
     * @access public
     * @param string $str
     * @return string
     */
    public function ascii_to_entities($str) {
        $count = 1;
        $out = '';
        $temp = array();

        for ($i = 0, $s = strlen($str); $i < $s; $i++) {
            $ordinal = ord($str[$i]);

            if ($ordinal < 128) {
                /*
                    If the $temp array has a value but we have moved on, then it seems only
                    fair that we output that entity and restart $temp before continuing. -Paul
                */
                if (count($temp) == 1) {
                    $out .= '&#' . array_shift($temp) . ';';
                    $count = 1;
                }

                $out .= $str[$i];
                continue;
            }

            if (count($temp) == 0) {
                // The lead byte tells how many bytes the sequence has:
                // below 224 -> 2 bytes, 224-239 -> 3 bytes, 240 and up -> 4 bytes.
                if ($ordinal < 224) {
                    $count = 2;
                } elseif ($ordinal < 240) {
                    $count = 3;
                } else {
                    $count = 4;
                }
            }

            $temp[] = $ordinal;

            if (count($temp) == $count) {
                // Combine the payload bits of all bytes into the code point.
                if ($count == 4) {
                    $number = (($temp[0] % 8) * 262144) + (($temp[1] % 64) * 4096) + (($temp[2] % 64) * 64) + ($temp[3] % 64);
                } elseif ($count == 3) {
                    $number = (($temp[0] % 16) * 4096) + (($temp[1] % 64) * 64) + ($temp[2] % 64);
                } else {
                    $number = (($temp[0] % 32) * 64) + ($temp[1] % 64);
                }

                $out .= '&#' . $number . ';';
                $count = 1;
                $temp = array();
            }
        }

        return $out;
    }

    // ------------------------------------------------------------------------

    /**
     * Entities to ASCII
     *
     * Converts numeric character entities back to their UTF-8 bytes and,
     * optionally, a few named entities back to plain characters.
     *
     * @access public
     * @param string $str
     * @param bool $all Also convert &amp; &lt; &gt; &quot; &apos; and &#45;
     * @return string
     */
    public function entities_to_ascii($str, $all = true) {
        if (preg_match_all('/\&#(\d+)\;/', $str, $matches)) {
            for ($i = 0, $s = count($matches[0]); $i < $s; $i++) {
                $digits = (int)$matches[1][$i];

                if ($digits > 0x10FFFF) {
                    // Not a valid Unicode code point - leave the entity untouched.
                    continue;
                }

                if ($digits < 128) {
                    $out = chr($digits);
                } elseif ($digits < 2048) {
                    $out = chr(192 | ($digits >> 6))
                        . chr(128 | ($digits & 63));
                } elseif ($digits < 65536) {
                    $out = chr(224 | ($digits >> 12))
                        . chr(128 | (($digits >> 6) & 63))
                        . chr(128 | ($digits & 63));
                } else {
                    // Code points beyond the BMP need a 4-byte sequence.
                    $out = chr(240 | ($digits >> 18))
                        . chr(128 | (($digits >> 12) & 63))
                        . chr(128 | (($digits >> 6) & 63))
                        . chr(128 | ($digits & 63));
                }

                $str = str_replace($matches[0][$i], $out, $str);
            }
        }

        if ($all) {
            $str = str_replace(
                array('&amp;', '&lt;', '&gt;', '&quot;', '&apos;', '&#45;'),
                array('&', '<', '>', '"', "'", '-'),
                $str
            );
        }

        return $str;
    }

    /**
     * Reduce Double Slashes
     *
     * Converts double slashes in a string to a single slash,
     * except those found in http://
     *
     * http://www.some-site.com//index.php
     *
     * becomes:
     *
     * http://www.some-site.com/index.php
     *
     * @access public
     * @param string $str
     * @return string
     */
    public function reduce_double_slashes($str) {
        // The character before the slashes is captured and put back; slashes
        // directly after a colon (as in "http://") do not match.
        return preg_replace("#([^:])//+#", "\\1/", $str);
    }

    // ------------------------------------------------------------------------

    /**
     * Reduce Multiples
     *
     * Reduces multiple instances of a particular character. Example:
     *
     * Fred, Bill,, Joe, Jimmy
     *
     * becomes:
     *
     * Fred, Bill, Joe, Jimmy
     *
     * @access public
     * @param string $str
     * @param string $character the character you wish to reduce
     * @param bool $trim TRUE/FALSE - whether to trim the character from the beginning/end
     * @return string
     */
    public function reduce_multiples($str, $character = ',', $trim = false) {
        $str = preg_replace('#' . preg_quote($character, '#') . '{2,}#', $character, $str);

        if ($trim === true) {
            $str = trim($str, $character);
        }

        return $str;
    }

}

/*
//explode string, return word and number of repeation
$r = explode('[spilit]', $value);

//regex
if ( preg_match('/([a-z]+)/', $r[0])) {
    preg_match_all( '/'. $r[0] .'/', $this -> checkString[$arrays], $match);
} else {
    preg_match_all( '/\\'. $r[0] .'/', $this -> checkString[$arrays], $match);
}

//count chars
if ( count($match[0]) <= $r[1]) {
    $this -> _is_valid[$arrays][$valData] = true;
} else {
    $this -> _is_valid[$arrays][$valData] = false;

    //set errors array
    $this -> error[$arrays][] = $r[0] . $this -> error_max_time_char;
}
*/