TextLib.php 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422
  1. <?php
  2. App::uses('String', 'Utility');
  3. /**
  4. * Extend String.
  5. * //TODO: cleanup
  6. *
  7. */
  8. class TextLib extends String {
  9. public $text, $length, $char, $letter, $space, $word, $rWord, $sen, $rSen, $para, $rPara, $beautified;
  /**
   * Create a text object.
   *
   * @param string|null $text Text stored on the instance; the instance methods
   *   read it from $this->text when they are not handed a string of their own.
   */
  public function __construct($text = null) {
      $this->text = $text;
  }
  /**
   * Parse the stored text as tab-separated data.
   *
   * The text is cut into rows at every "\n". Leading and trailing "\r" / "\n"
   * characters are trimmed from each row, which is then cut into cells at
   * every "\t".
   *
   * @return array List of rows; each row is a list of cell strings.
   */
  public function readTab() {
      $rows = array();
      $lines = explode("\n", $this->text);
      for ($i = 0, $total = count($lines); $i < $total; $i++) {
          $rows[] = explode("\t", trim($lines[$i], "\r\n"));
      }
      return $rows;
  }
  /**
   * Parse the stored text line by line with a sscanf() format.
   *
   * The text is cut into lines at every "\n"; leading and trailing "\r" / "\n"
   * characters are trimmed from each line before it is handed to sscanf().
   *
   * @param string $pattern A sscanf() format, e.g. '%d,%d,%d'.
   * @return array One entry per line holding whatever sscanf() returned for it.
   */
  public function readWithPattern($pattern) {
      $rows = array();
      foreach (explode("\n", $this->text) as $line) {
          $line = trim($line, "\r\n");
          $rows[] = sscanf($line, $pattern);
      }
      return $rows;
  }
  /**
   * Count the words in a text.
   *
   * A word is any run of non-whitespace characters. Words may be separated by
   * spaces, tabs or line breaks, and a word such as "0" is counted like any
   * other (empty() would treat it as missing).
   *
   * @param string $text
   * @return int Number of words; 0 for an empty or whitespace-only text.
   */
  public static function numberOfWords($text) {
      // PREG_SPLIT_NO_EMPTY discards the empty pieces produced by leading,
      // trailing or repeated whitespace, so every remaining piece is a word.
      $words = preg_split('/\s+/', (string)$text, -1, PREG_SPLIT_NO_EMPTY);
      if ($words === false) {
          return 0;
      }
      return count($words);
  }
  /**
   * Count the characters in a text.
   *
   * Options:
   * - 'whitespace': Whether whitespace ("\r", "\n", "\t" and space) should be
   *   counted as well, defaults to false.
   *
   * @param string $text
   * @param array $options
   * @return int Character count as reported by mb_strlen().
   */
  public static function numberOfChars($text, $options = array()) {
      $options = (array)$options + array('whitespace' => false);
      if (!$options['whitespace']) {
          $text = str_replace(array("\r", "\n", "\t", ' '), '', $text);
      }
      return mb_strlen($text);
  }
  /**
   * Return an abbreviated string, with the characters in the middle of an
   * excessively long string replaced by $ending.
   *
   * The room left after reserving space for $ending is split between the
   * start and the end of the text; when it cannot be split evenly the start
   * gets the extra character. Whitespace next to the cut is trimmed, so the
   * result can be shorter than $length. If $length leaves no room besides
   * $ending, only $ending is returned.
   *
   * @param string $text The original string.
   * @param int $length The length at which to abbreviate.
   * @param string $ending The marker placed where the text was cut out.
   * @return string The abbreviated string, if longer than $length.
   */
  public static function abbreviate($text, $length = 20, $ending = '...') {
      if (mb_strlen($text) <= $length) {
          return $text;
      }
      $available = max(0, (int)$length - mb_strlen($ending));
      $headLength = (int)ceil($available / 2);
      $tailLength = $available - $headLength;

      $head = rtrim(mb_substr($text, 0, $headLength));
      // A start offset of -0 is 0 and would return the whole text, so an empty
      // tail has to be handled explicitly.
      $tail = '';
      if ($tailLength > 0) {
          $tail = ltrim(mb_substr($text, -$tailLength));
      }
      return $head . $ending . $tail;
  }
  /**
   * Convert a string into the ord() values of its bytes.
   *
   * The conversion is byte based: a multi-byte character contributes one
   * number per byte. Only the bytes of the string are reported; no extra
   * entries are produced for the start or end of the string.
   *
   * @param string|null $str String to convert; the stored text is used when null.
   * @param string $separator Glue placed between the numbers.
   * @return string The byte values joined by $separator, '' for an empty string.
   */
  public function convertToOrd($str = null, $separator = '-') {
      if ($str === null) {
          $str = $this->text;
      }
      $str = (string)$str;
      if ($str === '') {
          return '';
      }
      $codes = array();
      foreach (str_split($str) as $byte) {
          $codes[] = ord($byte);
      }
      return implode($separator, $codes);
  }
  /**
   * Render the bytes of a string and their ord() values as an HTML table.
   *
   * Every chunk of at most $maxCols bytes produces two rows: a header row with
   * the (HTML-escaped) bytes and a data row with the matching ord() values.
   * The conversion is byte based, so a multi-byte character spans several cells.
   *
   * @param string $str String to convert.
   * @param int $maxCols Maximum number of cells per row; 0 puts everything in one row.
   * @return string HTML table markup.
   */
  public static function convertToOrdTable($str, $maxCols = 20) {
      $str = (string)$str;
      if ($str === '') {
          return '<table></table>';
      }
      $bytes = str_split($str);
      $perRow = (int)$maxCols > 0 ? (int)$maxCols : count($bytes);

      $html = '<table>';
      foreach (array_chunk($bytes, $perRow) as $row) {
          $chars = array();
          $ords = array();
          foreach ($row as $byte) {
              // A single-byte charset keeps htmlspecialchars() from rejecting
              // lone bytes of multi-byte characters; only & < > " ' are changed.
              $chars[] = htmlspecialchars($byte, ENT_QUOTES, 'ISO-8859-1');
              $ords[] = ord($byte);
          }
          $html .= '<tr><th>' . implode('</th><th>', $chars) . '</th></tr>';
          $html .= '<tr><td>' . implode('</td><td>', $ords) . '</td></tr>';
      }
      $html .= '</table>';
      return $html;
  }
  /**
   * Explode a string of given tags into an array.
   *
   * Tags are separated by commas. A tag may be wrapped in double quotes to
   * contain commas, and a doubled quote ("") inside such a tag stands for a
   * literal quote. Accepted input looks like:
   *
   *   this, "somecompany, llc", "and ""this"" w,o.rks", foo bar
   *
   * @param string $tags Comma-separated tag string as typed by a user.
   * @return array List of non-empty tags with the quoting removed.
   */
  public function explodeTags($tags) {
      $regexp = '%(?:^|,\ *)("(?>[^"]*)(?>""[^"]* )*"|(?: [^",]*))%x';
      preg_match_all($regexp, $tags, $matches);

      $result = array();
      foreach (array_unique($matches[1]) as $rawTag) {
          // Strip the surrounding quotes of an escaped term and turn doubled
          // quotes back into single ones, so the tag is stored as intended.
          $tag = preg_replace('/^"(.*)"$/', '\1', $rawTag);
          $tag = trim(str_replace('""', '"', $tag));
          // Compare against '' explicitly: a plain truth test would drop "0".
          if ($tag !== '') {
              $result[] = $tag;
          }
      }
      return $result;
  }
  /**
   * Implode an array of tags into a comma-separated string.
   *
   * A tag containing a comma or a double quote is wrapped in double quotes,
   * with every quote inside it doubled.
   *
   * @param array $tags List of tag strings.
   * @return string The tags joined by ', '.
   */
  public function implodeTags($tags) {
      $parts = array();
      foreach ($tags as $tag) {
          $needsQuoting = strpos($tag, ',') !== false || strpos($tag, '"') !== false;
          $parts[] = $needsQuoting ? '"' . str_replace('"', '""', $tag) . '"' : $tag;
      }
      return implode(', ', $parts);
  }
  /**
   * Prevents [widow words](http://www.shauninman.com/archive/2006/08/22/widont_wordpress_plugin)
   * by replacing the last space of the text with a non-breaking space entity.
   *
   * Trailing whitespace is removed first. A text without any space is
   * returned as is (apart from that trimming).
   *
   * @param string|null $str Text to process; the stored text is used when null.
   * @return string
   */
  public function widont($str = null) {
      $text = rtrim($str === null ? $this->text : $str);
      $lastSpace = strrpos($text, ' ');
      if ($lastSpace === false) {
          return $text;
      }
      return substr($text, 0, $lastSpace) . '&nbsp;' . substr($text, $lastSpace + 1);
  }
  191. /* text object specific */
  /**
   * Extract the distinct words of the stored text.
   *
   * Line breaks and tabs are treated as spaces (NL and TB are constants defined
   * outside this file - presumably newline and tab, verify against the
   * bootstrap). Each piece is lower-cased unless 'case_sensitive' is set, then
   * punctuation and digits are stripped. Pieces that end up empty or violate
   * the length limits are dropped.
   *
   * @param array $options
   * - min_char: minimum length of a word
   * - max_char: maximum length of a word
   * - case_sensitive: keep the original case, defaults to false
   * @return array Distinct words; keys are the positions of the original
   *   pieces and are therefore not necessarily consecutive.
   */
  public function words($options = array()) {
      $options = (array)$options;
      $minChars = !empty($options['min_char']) ? $options['min_char'] : 0;
      $maxChars = !empty($options['max_char']) ? $options['max_char'] : 0;
      $caseSensitive = !empty($options['case_sensitive']);

      // Characters removed from every piece; built once instead of per piece.
      $strip = array(
          ',', '.', ';', ':', '#', '(', ')', '{', '}', '[', ']', '$', '%', '"', '!', '?', '<', '>', '=', '/',
          '1', '2', '3', '4', '5', '6', '7', '8', '9', '0'
      );

      $text = str_replace(array(PHP_EOL, NL, TB), ' ', $this->text);
      $pieces = array_unique(explode(' ', $text));
      foreach ($pieces as $key => $piece) {
          if (!$caseSensitive) {
              $piece = mb_strtolower($piece);
          }
          $piece = trim(str_replace($strip, '', $piece));
          $length = mb_strlen($piece);
          if ($length === 0 || ($minChars && $length < $minChars) || ($maxChars && $length > $maxChars)) {
              unset($pieces[$key]);
              continue;
          }
          $pieces[$key] = $piece;
      }
      // Stripping can turn different pieces into the same word, so dedupe again.
      return array_unique($pieces);
  }
  /**
   * Limit the number of words in a string.
   *
   * <code>
   * // Returns "This is a..."
   * echo TextLib::maxWords('This is a sentence.', 3);
   *
   * // Limit the number of words and append a custom ending
   * echo TextLib::maxWords('This is a sentence.', 3, array('ellipsis' => '---'));
   * </code>
   *
   * @param string $value
   * @param int $words Maximum number of words to keep; values below 1 yield ''.
   * @param array $options
   * - ellipsis: string appended when the text was shortened, defaults to '...'
   * - html: if set and App.encoding is UTF-8, the default ellipsis becomes the
   *   single ellipsis character
   * @return string The shortened text. $value is returned unchanged when the
   *   word pattern cannot be applied to it.
   */
  public static function maxWords($value, $words = 100, $options = array()) {
      $options = (array)$options;
      $defaults = array(
          'ellipsis' => '...'
      );
      if (!empty($options['html']) && Configure::read('App.encoding') === 'UTF-8') {
          $defaults['ellipsis'] = "\xe2\x80\xa6";
      }
      $options += $defaults;

      $value = (string)$value;
      // The limit is inserted into the pattern, so force it to be a plain int.
      $words = (int)$words;
      if ($words < 1 || trim($value) === '') {
          return '';
      }

      $matched = preg_match('/^\s*+(?:\S++\s*+){1,' . $words . '}/u', $value, $matches);
      if (!$matched) {
          // preg_match() failed or found nothing; return the text untouched
          // rather than reading a match that is not there.
          return $value;
      }

      $end = $options['ellipsis'];
      if (mb_strlen($value) === mb_strlen($matches[0])) {
          // Nothing was cut off, so no ellipsis is needed.
          $end = '';
      }
      return rtrim($matches[0]) . $end;
  }
  /**
   * High ASCII to Entities
   *
   * Replaces every multi-byte UTF-8 sequence (2, 3 or 4 bytes) in the string
   * with the numeric character entity of its code point. Bytes below 128 are
   * copied unchanged. Bytes of an incomplete sequence are emitted as one
   * entity per byte instead of being dropped.
   *
   * @param string $str
   * @return string
   */
  public function asciiToEntities($str) {
      $str = (string)$str;
      $out = '';
      // Bytes of the multi-byte sequence currently being collected.
      $pending = array();
      $expected = 0;

      for ($i = 0, $total = strlen($str); $i < $total; $i++) {
          $byte = ord($str[$i]);

          if ($byte < 128) {
              // An ASCII byte ends any unfinished sequence; output what was
              // collected so far byte by byte and start over.
              foreach ($pending as $stray) {
                  $out .= '&#' . $stray . ';';
              }
              $pending = array();
              $out .= $str[$i];
              continue;
          }

          if (!$pending) {
              // The lead byte tells how many bytes the sequence has.
              if ($byte < 224) {
                  $expected = 2;
              } elseif ($byte < 240) {
                  $expected = 3;
              } else {
                  $expected = 4;
              }
          }
          $pending[] = $byte;
          if (count($pending) < $expected) {
              continue;
          }

          // Combine the payload bits of the lead and continuation bytes.
          if ($expected === 2) {
              $number = (($pending[0] % 32) * 64) + ($pending[1] % 64);
          } elseif ($expected === 3) {
              $number = (($pending[0] % 16) * 4096) + (($pending[1] % 64) * 64) + ($pending[2] % 64);
          } else {
              $number = (($pending[0] % 8) * 262144) + (($pending[1] % 64) * 4096)
                  + (($pending[2] % 64) * 64) + ($pending[3] % 64);
          }
          $out .= '&#' . $number . ';';
          $pending = array();
      }

      // The string ended in the middle of a sequence.
      foreach ($pending as $stray) {
          $out .= '&#' . $stray . ';';
      }
      return $out;
  }
  /**
   * Entities to ASCII
   *
   * Converts decimal numeric character entities (&#NNN;) back to their UTF-8
   * byte sequences (1 to 4 bytes). Entities beyond U+10FFFF are left as they
   * are. With $all, the named entities &amp; &lt; &gt; &quot; and &apos; are
   * decoded as well.
   *
   * Every part of the input is decoded exactly once, so "&amp;lt;" becomes
   * "&lt;" and not "<".
   *
   * @param string $str
   * @param bool $all Whether the named entities should be decoded, too.
   * @return string
   */
  public function EntitiesToAscii($str, $all = true) {
      $named = array('&amp;' => '&', '&lt;' => '<', '&gt;' => '>', '&quot;' => '"', '&apos;' => "'");

      // With DELIM_CAPTURE the even entries are plain text and the odd entries
      // are the numeric entities that separated them.
      $parts = preg_split('/(&#\d+;)/', (string)$str, -1, PREG_SPLIT_DELIM_CAPTURE);
      if ($parts === false) {
          return $str;
      }

      foreach ($parts as $index => $part) {
          if ($index % 2 === 0) {
              if ($all) {
                  // strtr() with an array replaces in a single pass.
                  $parts[$index] = strtr($part, $named);
              }
              continue;
          }

          $digits = ltrim(substr($part, 2, -1), '0');
          if (strlen($digits) > 7) {
              // Far beyond the Unicode range; keep the entity untouched.
              continue;
          }
          $code = (int)$digits;

          if ($code < 0x80) {
              $parts[$index] = chr($code);
          } elseif ($code < 0x800) {
              $parts[$index] = chr(192 | ($code >> 6))
                  . chr(128 | ($code & 63));
          } elseif ($code < 0x10000) {
              $parts[$index] = chr(224 | ($code >> 12))
                  . chr(128 | (($code >> 6) & 63))
                  . chr(128 | ($code & 63));
          } elseif ($code <= 0x10FFFF) {
              $parts[$index] = chr(240 | ($code >> 18))
                  . chr(128 | (($code >> 12) & 63))
                  . chr(128 | (($code >> 6) & 63))
                  . chr(128 | ($code & 63));
          }
      }
      return implode('', $parts);
  }
  /**
   * Reduce Double Slashes
   *
   * Collapses every run of two or more slashes into a single slash, unless
   * the run directly follows a colon (as in http://). A run at the very start
   * of the string is left alone as well, because the pattern needs a
   * preceding character.
   *
   * http://www.some-site.com//index.php
   *
   * becomes:
   *
   * http://www.some-site.com/index.php
   *
   * @param string $str
   * @return string
   */
  public function reduce_double_slashes($str) {
      $pattern = '#([^:])//+#';
      $replacement = '\\1/';
      return preg_replace($pattern, $replacement, $str);
  }
  351. // ------------------------------------------------------------------------
  /**
   * Reduce Multiples
   *
   * Reduces multiple consecutive instances of a particular character (or
   * character sequence) to a single one. Example:
   *
   * Fred, Bill,, Joe, Jimmy
   *
   * becomes:
   *
   * Fred, Bill, Joe, Jimmy
   *
   * @param string $str
   * @param string $character The character you wish to reduce; an empty string
   *   leaves $str unchanged.
   * @param bool $trim TRUE/FALSE - whether to trim the character from the
   *   beginning/end. Only the boolean true enables trimming.
   * @return string
   */
  public function reduce_multiples($str, $character = ',', $trim = false) {
      $character = (string)$character;
      if ($character === '') {
          return $str;
      }
      // Group the quoted text so {2,} repeats all of it, not just its last character.
      $pattern = '#(?:' . preg_quote($character, '#') . '){2,}#';
      // Backslashes and dollar signs are special in a replacement string.
      $replacement = addcslashes($character, '\\$');
      $str = preg_replace($pattern, $replacement, $str);
      if ($trim === true) {
          $str = trim($str, $character);
      }
      return $str;
  }
  375. }