TextLib.php 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606
  1. <?php
  2. /**
  3. * 2010-08-31 ms
  4. */
  5. class TextLib {
  6. protected $text, $lenght, $char, $letter, $space, $word, $r_word, $sen, $r_sen, $para,
  7. $r_para, $beautified;
  8. public function __construct($text) {
  9. $this->text = $text;
  10. }
  11. /**
  12. * @param string $stringToCheck
  13. * @param tolerance (in %: 0 ... 1)
  14. * @return boolean $success
  15. * 2011-10-13 ms
  16. */
  17. public function isScreamFont($str = null, $tolerance = 0.4) {
  18. if ($str === null) {
  19. $str = $this->text;
  20. }
  21. if (empty($str)) {
  22. return false;
  23. }
  24. $res = preg_match_all('/[A-ZÄÖÜ]/u', $str, $uppercase);
  25. $uppercase = array_shift($uppercase);
  26. //echo returns($uppercase);
  27. $res = preg_match_all('/[a-zäöüß]/u', $str, $lowercase);
  28. $lowercase = array_shift($lowercase);
  29. //echo returns($lowercase);
  30. if (($countUpper = count($uppercase)) && $countUpper >= count($lowercase)) {
  31. return true;
  32. }
  33. //TODO: tolerance
  34. return false;
  35. }
  36. /* utf8 generell stuff */
  37. /**
  38. * Tests whether a string contains only 7-bit ASCII bytes. This is used to
  39. * determine when to use native functions or UTF-8 functions.
  40. *
  41. * $ascii = UTF8::is_ascii($str);
  42. *
  43. * @param string string to check
  44. * @return bool
  45. */
  46. public function isAscii($str = null) {
  47. if ($str === null) {
  48. $str = $this->text;
  49. }
  50. return !preg_match('/[^\x00-\x7F]/S', $str);
  51. }
  52. /**
  53. * Strips out device control codes in the ASCII range.
  54. *
  55. * $str = UTF8::strip_ascii_ctrl($str);
  56. *
  57. * @param string string to clean
  58. * @return string
  59. */
  60. public function stripAsciiCtrl($str = null) {
  61. if ($str === null) {
  62. $str = $this->text;
  63. }
  64. return preg_replace('/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]+/S', '', $str);
  65. }
  66. /**
  67. * Strips out all non-7bit ASCII bytes.
  68. *
  69. * $str = UTF8::strip_non_ascii($str);
  70. *
  71. * @param string string to clean
  72. * @return string
  73. */
  74. public function stripNonAscii($str = null) {
  75. if ($str === null) {
  76. $str = $this->text;
  77. }
  78. return preg_replace('/[^\x00-\x7F]+/S', '', $str);
  79. }
  80. public function convertToOrd($str = null, $separator = '-') {
  81. /*
  82. if (!class_exists('UnicodeLib')) {
  83. App::uses('UnicodeLib', 'Tools.Lib');
  84. }
  85. */
  86. if ($str === null) {
  87. $str = $this->text;
  88. }
  89. $chars = preg_split('//', $str, -1);
  90. $res = array();
  91. foreach ($chars as $char) {
  92. //$res[] = UnicodeLib::ord($char);
  93. $res[] = ord($char);
  94. }
  95. return implode($separator, $res);
  96. }
  97. public function convertToOrdTable($str) {
  98. $res = '<table><tr>';
  99. $r = array('chr'=>array(), 'ord'=>array());
  100. $chars = preg_split('//', $str, -1);
  101. foreach ($chars as $char) {
  102. //$res[] = UnicodeLib::ord($char);
  103. $r['ord'][] = ord($char);
  104. $r['chr'][] = $char;
  105. }
  106. $res .= '<th>'.implode('</th><th>', $r['chr']).'</th>';
  107. $res .= '</tr>';
  108. $res .= '<tr>';
  109. $res .= '<td>'.implode('</th><th>', $r['ord']).'</td>';
  110. $res .= '</tr></table>';
  111. return $res;
  112. }
  113. /* other */
  114. /**
  115. * Explode a string of given tags into an array.
  116. */
  117. public function explodeTags($tags) {
  118. // This regexp allows the following types of user input:
  119. // this, "somecompany, llc", "and ""this"" w,o.rks", foo bar
  120. $regexp = '%(?:^|,\ *)("(?>[^"]*)(?>""[^"]* )*"|(?: [^",]*))%x';
  121. preg_match_all($regexp, $tags, $matches);
  122. $typed_tags = array_unique($matches[1]);
  123. $tags = array();
  124. foreach ($typed_tags as $tag) {
  125. // If a user has escaped a term (to demonstrate that it is a group,
  126. // or includes a comma or quote character), we remove the escape
  127. // formatting so to save the term into the database as the user intends.
  128. $tag = trim(str_replace('""', '"', preg_replace('/^"(.*)"$/', '\1', $tag)));
  129. if ($tag != "") {
  130. $tags[] = $tag;
  131. }
  132. }
  133. return $tags;
  134. }
  135. /**
  136. * Implode an array of tags into a string.
  137. */
  138. public function implodeTags($tags) {
  139. $encoded_tags = array();
  140. foreach ($tags as $tag) {
  141. // Commas and quotes in tag names are special cases, so encode them.
  142. if (strpos($tag, ',') !== FALSE || strpos($tag, '"') !== FALSE) {
  143. $tag = '"'. str_replace('"', '""', $tag) .'"';
  144. }
  145. $encoded_tags[] = $tag;
  146. }
  147. return implode(', ', $encoded_tags);
  148. }
  149. /**
  150. * Prevents [widow words](http://www.shauninman.com/archive/2006/08/22/widont_wordpress_plugin)
  151. * by inserting a non-breaking space between the last two words.
  152. *
  153. * echo Text::widont($text);
  154. *
  155. * @param string text to remove widows from
  156. * @return string
  157. */
  158. public function widont($str = null) {
  159. if ($str === null) {
  160. $str = $this->text;
  161. }
  162. $str = rtrim($str);
  163. $space = strrpos($str, ' ');
  164. if ($space !== FALSE) {
  165. $str = substr($str, 0, $space).'&nbsp;'.substr($str, $space + 1);
  166. }
  167. return $str;
  168. }
  169. /* text object specific */
  170. /**
  171. * @return array(char=>amount) for empty char or int amount for specific char
  172. * 2010-08-31 ms
  173. */
  174. public function occurrences($char = null, $caseSensitive = false) {
  175. if ($caseSensitive) {
  176. $str = $this->text;
  177. } else {
  178. if ($char !== null) {
  179. $char = strtolower($char);
  180. }
  181. $str = strtolower($this->text);
  182. }
  183. if ($char === null) {
  184. $occ = array();
  185. $str = str_split($str);
  186. foreach ($str as $value) {
  187. if (array_key_exists($value, $occ)) {
  188. $occ[$value] += 1;
  189. } else {
  190. $occ[$value] = 1;
  191. }
  192. }
  193. return $occ;
  194. } else {
  195. $occ = 0;
  196. $pos = 0;
  197. do {
  198. $pos = strpos($str, $char, $pos);
  199. if ($pos !== false) {
  200. $occ++;
  201. $pos++;
  202. } else {
  203. break;
  204. }
  205. } while (true);
  206. return $occ;
  207. }
  208. }
  209. /**
  210. * @return array(char=>amount) for empty char or int amount for specific char
  211. * 2010-08-31 ms
  212. */
  213. public function maxOccurrences($caseSensitive = false) {
  214. $arr = $this->occurrences(null, $caseSensitive);
  215. $max = 0;
  216. $occ = array();
  217. foreach ($arr as $key => $value) {
  218. if ($value === $max) {
  219. $occ[$key] = $value;
  220. } elseif ($value > $max) {
  221. $max = $value;
  222. $occ = array($key => $value);
  223. }
  224. }
  225. echo returns($occ);
  226. return $occ;
  227. }
  228. public function getLength() {
  229. if (!$this->lenght) {
  230. $this->lenght = mb_strlen($this->text);
  231. }
  232. return $this->lenght;
  233. }
  234. public function getCharacter() {
  235. if (!$this->char) $this->char = mb_strlen(strtr($this->text, array("\n" => '', "\r" =>
  236. '')));
  237. return $this->char;
  238. }
  239. public function getLetter() {
  240. if (!$this->letter) {
  241. $l_text = mb_strtolower($this->text);
  242. for ($i = 0; $i < $this->lenght; $i++)
  243. if (mb_strpos("abcdefghijklmnopqrstuvwxyzäöü", $l_text[$i]) != false) $this->
  244. letter++;
  245. }
  246. return $this->letter;
  247. }
  248. public function getSpace() {
  249. if (!$this->space) $this->space = mb_substr_count($this->text, " ") +
  250. mb_substr_count($this->text, "\t");
  251. return $this->space;
  252. }
  253. public function getSymbol() {
  254. return $this->getCharacter() - $this->getLetter() - $this->getSpace();
  255. }
  256. //TODO: improve it to work with case insensitivity and utf8 chars like é or î
  257. public function getWord($parse = false) {
  258. if (!$this->word && !$this->r_word) {
  259. @preg_match_all("/[A-Za-zäöüÄÖÜß\-'\\\"]+/", $this->text, $m);
  260. $this->word = count($m[0]);
  261. $this->r_word = $m[0];
  262. }
  263. return $parse ? $this->r_word : $this->word;
  264. }
  265. /**
  266. * @param options
  267. * - min_char, max_char, case_sensititive, ...
  268. * 2010-10-09 ms
  269. */
  270. public function words($options = array()) {
  271. if (true || !$this->xr_word) {
  272. $text = str_replace(array(PHP_EOL, NL, TB), ' ', $this->text);
  273. $pieces = explode(' ', $text);
  274. $pieces = array_unique($pieces);
  275. # strip chars like . or ,
  276. foreach ($pieces as $key => $piece) {
  277. if (empty($options['case_sensitive'])) {
  278. $piece = mb_strtolower($piece);
  279. }
  280. $search = array(',', '.', ';', ':', '#', '', '(', ')', '{', '}', '[', ']', '$', '%', '"', '!', '?', '<', '>', '=', '/');
  281. $search = array_merge($search, array(1, 2, 3, 4, 5, 6, 7, 8, 9, 0));
  282. $piece = str_replace($search, '', $piece);
  283. $piece = trim($piece);
  284. if (empty($piece) || !empty($options['min_char']) && mb_strlen($piece) < $options['min_char'] || !empty($options['max_char']) && mb_strlen($piece) > $options['max_char']) {
  285. unset($pieces[$key]);
  286. } else {
  287. $pieces[$key] = $piece;
  288. }
  289. }
  290. $pieces = array_unique($pieces);
  291. //$this->xr_word = $pieces;
  292. }
  293. return $pieces;
  294. }
  295. /**
  296. * @param options
  297. * - min_char, max_char, case_sensititive, sort ('asc', 'desc', 'length', 'alpha', false), limit...
  298. * 2010-10-09 ms
  299. */
  300. public function wordCount($options = array()) {
  301. if (true || !$this->rr_word) {
  302. $text = str_replace(array(NL, CR, PHP_EOL, TB), ' ', $this->text);
  303. $res = array();
  304. $search = array('*', '+', '~', ',', '.', ';', ':', '#', '', '(', ')', '{', '}', '[', ']', '$', '%', '“', '”', '—', '"', '‘', '’', '!', '?', '<', '>', '=', '/');
  305. $search = array_merge($search, array(1, 2, 3, 4, 5, 6, 7, 8, 9, 0));
  306. $text = str_replace($search, ' ', $text);
  307. $pieces = explode(' ', $text);
  308. //TODO: use array_count_values()?
  309. foreach ($pieces as $key => $piece) {
  310. if (empty($options['case_sensitive'])) {
  311. $piece = mb_strtolower($piece);
  312. }
  313. $piece = trim($piece);
  314. if (empty($piece) || !empty($options['min_char']) && mb_strlen($piece) < $options['min_char'] || !empty($options['max_char']) && mb_strlen($piece) > $options['max_char']) {
  315. unset($pieces[$key]);
  316. continue;
  317. }
  318. if (!array_key_exists($piece, $res)) {
  319. $res[$piece] = 0;
  320. }
  321. $res[$piece]++;
  322. }
  323. if (!empty($options['sort'])) {
  324. $sort = strtolower($options['sort']);
  325. if ($sort == 'asc') {
  326. asort($res);
  327. } elseif ($sort == 'desc') {
  328. arsort($res);
  329. } elseif ($sort == 'length') {
  330. //TODO:
  331. //uasort($res, $callback);
  332. } elseif ($sort == 'alpha') {
  333. ksort($res);
  334. }
  335. }
  336. if (!empty($options['limit'])) {
  337. $res = array_slice($res, 0, (int)$options['limit'], true);
  338. }
  339. //$this->rr_word = $res;
  340. }
  341. return $res; // $this->rr_word;
  342. }
  343. public function getSentence($parse = false) {
  344. if (!$this->sen && !$this->r_sen) {
  345. @preg_match_all("/[^:|;|\!|\.]+(:|;|\!|\.| )+/", $this->text, $m);
  346. $this->sen = count($m[0]);
  347. foreach ($m[0] as $s) $this->r_sen[] = strtr(trim($s), array("\n" => '', "\r" =>
  348. ''));
  349. }
  350. return $parse ? $this->r_sen : $this->sen;
  351. }
  352. public function getParagraph($parse = false) {
  353. if (!$this->para && !$this->r_para) {
  354. @preg_match_all("/[^\n]+?(:|;|\!|\.| )+\n/s", strtr($this->text, array("\r" =>
  355. '')) . "\n", $m);
  356. $this->para = count($m[0]);
  357. foreach ($m[0] as $p) $this->r_para[] = trim($p);
  358. }
  359. return $parse ? $this->r_para : $this->para;
  360. }
  361. public function beautify($wordwrap = false) {
  362. if (!$this->beautified) {
  363. $this->beautified = @preg_replace(array("/ {1,}/", "/\. {1,}\./", "/\. *(?!\.)/",
  364. "/(,|:|;|\!|\)) */", "/(,|:|;|\!|\)|\.) *\r\n/", "/(\r\n) {3,}/"), array(" ", ".",
  365. ". ", "$1 ", "$1\r\n", "\r\n\r\n"), $this->text);
  366. }
  367. return $wordwrap ? wordwrap($this->beautified, $wordwrap) : $this->beautified;
  368. }
  369. /**
  370. * High ASCII to Entities
  371. *
  372. * Converts High ascii text and MS Word special characters to character entities
  373. *
  374. * @access public
  375. * @param string
  376. * @return string
  377. */
  378. public function ascii_to_entities($str) {
  379. $count = 1;
  380. $out = '';
  381. $temp = array();
  382. for ($i = 0, $s = strlen($str); $i < $s; $i++) {
  383. $ordinal = ord($str[$i]);
  384. if ($ordinal < 128) {
  385. /*
  386. If the $temp array has a value but we have moved on, then it seems only
  387. fair that we output that entity and restart $temp before continuing. -Paul
  388. */
  389. if (count($temp) == 1) {
  390. $out .= '&#' . array_shift($temp) . ';';
  391. $count = 1;
  392. }
  393. $out .= $str[$i];
  394. } else {
  395. if (count($temp) == 0) {
  396. $count = ($ordinal < 224) ? 2 : 3;
  397. }
  398. $temp[] = $ordinal;
  399. if (count($temp) == $count) {
  400. $number = ($count == 3) ? (($temp['0'] % 16) * 4096) + (($temp['1'] % 64) * 64) + ($temp['2'] %
  401. 64) : (($temp['0'] % 32) * 64) + ($temp['1'] % 64);
  402. $out .= '&#' . $number . ';';
  403. $count = 1;
  404. $temp = array();
  405. }
  406. }
  407. }
  408. return $out;
  409. }
  410. // ------------------------------------------------------------------------
  411. /**
  412. * Entities to ASCII
  413. *
  414. * Converts character entities back to ASCII
  415. *
  416. * @access public
  417. * @param string
  418. * @param bool
  419. * @return string
  420. */
  421. public function entities_to_ascii($str, $all = true) {
  422. if (preg_match_all('/\&#(\d+)\;/', $str, $matches)) {
  423. for ($i = 0, $s = count($matches['0']); $i < $s; $i++) {
  424. $digits = $matches['1'][$i];
  425. $out = '';
  426. if ($digits < 128) {
  427. $out .= chr($digits);
  428. } elseif ($digits < 2048) {
  429. $out .= chr(192 + (($digits - ($digits % 64)) / 64));
  430. $out .= chr(128 + ($digits % 64));
  431. } else {
  432. $out .= chr(224 + (($digits - ($digits % 4096)) / 4096));
  433. $out .= chr(128 + ((($digits % 4096) - ($digits % 64)) / 64));
  434. $out .= chr(128 + ($digits % 64));
  435. }
  436. $str = str_replace($matches['0'][$i], $out, $str);
  437. }
  438. }
  439. if ($all) {
  440. $str = str_replace(array("&amp;", "&lt;", "&gt;", "&quot;", "&apos;", "&#45;"),
  441. array("&", "<", ">", "\"", "'", "-"), $str);
  442. }
  443. return $str;
  444. }
  445. /**
  446. * Reduce Double Slashes
  447. *
  448. * Converts double slashes in a string to a single slash,
  449. * except those found in http://
  450. *
  451. * http://www.some-site.com//index.php
  452. *
  453. * becomes:
  454. *
  455. * http://www.some-site.com/index.php
  456. *
  457. * @access public
  458. * @param string
  459. * @return string
  460. */
  461. public function reduce_double_slashes($str) {
  462. return preg_replace("#([^:])//+#", "\\1/", $str);
  463. }
  464. // ------------------------------------------------------------------------
  465. /**
  466. * Reduce Multiples
  467. *
  468. * Reduces multiple instances of a particular character. Example:
  469. *
  470. * Fred, Bill,, Joe, Jimmy
  471. *
  472. * becomes:
  473. *
  474. * Fred, Bill, Joe, Jimmy
  475. *
  476. * @access public
  477. * @param string
  478. * @param string the character you wish to reduce
  479. * @param bool TRUE/FALSE - whether to trim the character from the beginning/end
  480. * @return string
  481. */
  482. public function reduce_multiples($str, $character = ',', $trim = false) {
  483. $str = preg_replace('#' . preg_quote($character, '#') . '{2,}#', $character, $str);
  484. if ($trim === true) {
  485. $str = trim($str, $character);
  486. }
  487. return $str;
  488. }
  489. }
/*
//explode string, return word and number of repetitions
$r = explode('[spilit]', $value);
//regex
if ( preg_match('/([a-z]+)/', $r[0])) {
preg_match_all( '/'. $r[0] .'/', $this -> checkString[$arrays], $match);
} else {
preg_match_all( '/\\'. $r[0] .'/', $this -> checkString[$arrays], $match);
}
//count chars
if ( count($match[0]) <= $r[1]) {
$this -> _is_valid[$arrays][$valData] = true;
} else {
$this -> _is_valid[$arrays][$valData] = false;
//set errors array
$this -> error[$arrays][] = $r[0] . $this -> error_max_time_char;
}
*/