TextLib.php 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420
  1. <?php
  2. App::uses('String', 'Utility');
  3. /**
  4. * Extend String.
  5. * //TODO: cleanup
  6. *
  7. */
  8. class TextLib extends String {
  9. protected $text, $length, $char, $letter, $space, $word, $r_word, $sen, $r_sen, $para,
  10. $r_para, $beautified;
  /**
   * Create a text object.
   *
   * @param string|null $text Text stored on the instance; instance methods
   *   fall back to it when they are not given an explicit string.
   */
  public function __construct($text = null) {
    $this->text = $text;
  }
  /**
   * Parse the stored text as tab-separated data.
   *
   * The text is split on "\n"; CR/LF characters are stripped from both ends
   * of every line, and each line is then split on tab characters.
   *
   * @return array List of rows, each row being a list of string cells.
   */
  public function readTab() {
    $rows = array();
    $lines = explode("\n", $this->text);
    for ($i = 0, $total = count($lines); $i < $total; $i++) {
      $rows[] = explode("\t", trim($lines[$i], "\r\n"));
    }
    return $rows;
  }
  /**
   * Parse every line of the stored text with an sscanf() format.
   *
   * Example pattern: '%s,%s,%s'
   *
   * @param string $pattern Format string handed to sscanf() for each line.
   * @return array One entry per line, each being whatever sscanf() returned
   *   for that line (CR/LF stripped from both ends first).
   */
  public function readWithPattern($pattern) {
    $parsed = array();
    $lines = explode("\n", $this->text);
    foreach ($lines as $line) {
      $clean = trim($line, "\r\n");
      $parsed[] = sscanf($clean, $pattern);
    }
    return $parsed;
  }
  /**
   * Count words in a text.
   *
   * A word is any run of characters delimited by whitespace (space, tab,
   * CR, LF, vertical tab, form feed) or NUL bytes. Unlike str_word_count(),
   * tokens made of digits or multi-byte characters are counted as well.
   *
   * @param string $text
   * @return integer Number of whitespace-separated tokens; 0 for empty input.
   */
  public static function numberOfWords($text) {
    // Split on whitespace runs instead of single spaces so that words separated
    // by newlines or tabs are not merged, and so that "0" is still a word.
    $words = preg_split('/[\s\x00]+/', (string)$text, -1, PREG_SPLIT_NO_EMPTY);
    if ($words === false) {
      return 0;
    }
    return count($words);
  }
  /**
   * Count chars in a text.
   *
   * Options:
   * - 'whitespace': If whitespace (CR, LF, tab, space) should be counted as
   *   well, defaults to false
   *
   * @param string $text
   * @param array $options
   * @return integer Character count as reported by mb_strlen().
   */
  public static function numberOfChars($text, $options = array()) {
    // Whitespace is only stripped when the caller did not ask to count it.
    if (empty($options['whitespace'])) {
      $text = str_replace(array("\r", "\n", "\t", ' '), '', $text);
    }
    return mb_strlen($text);
  }
  /**
   * Return an abbreviated string, with characters in the middle of the
   * excessively long string replaced by $ending.
   *
   * The result is at most $length characters long: the space left after
   * $ending is split between the head and the tail of the text, the head
   * getting the extra character when the split is uneven. Whitespace next to
   * $ending is trimmed, so the result can be slightly shorter.
   *
   * @param string $text The original string.
   * @param integer $length The length at which to abbreviate.
   * @param string $ending Replacement for the removed middle part.
   * @return string The abbreviated string, if longer than $length.
   */
  public static function abbreviate($text, $length = 20, $ending = '...') {
    $length = (int)$length;
    if (mb_strlen($text) <= $length) {
      return $text;
    }

    // Room left for original characters once the ending is accounted for.
    $keep = $length - mb_strlen($ending);
    if ($keep <= 0) {
      return mb_substr($ending, 0, max(0, $length));
    }

    $headLength = (int)ceil($keep / 2);
    $tailLength = $keep - $headLength;

    $head = rtrim(mb_substr($text, 0, $headLength));
    // A negative offset of 0 would select the whole string, so skip the tail then.
    $tail = $tailLength > 0 ? ltrim(mb_substr($text, -$tailLength)) : '';

    return $head . $ending . $tail;
  }
  90. /* other */
  /**
   * Convert a string into the list of its byte ordinals.
   *
   * The string is processed byte by byte with ord(), so a multi-byte
   * character contributes one number per byte.
   *
   * @param string|null $str String to convert; the stored text when null.
   * @param string $separator Glue placed between the ordinals.
   * @return string Ordinals joined by $separator; '' for an empty string.
   */
  public function convertToOrd($str = null, $separator = '-') {
    if ($str === null) {
      $str = $this->text;
    }
    // PREG_SPLIT_NO_EMPTY drops the empty leading/trailing pieces that a bare
    // '//' split produces and that would otherwise show up as ordinal 0.
    $bytes = preg_split('//', (string)$str, -1, PREG_SPLIT_NO_EMPTY);
    $ordinals = array_map('ord', $bytes);
    return implode($separator, $ordinals);
  }
  /**
   * Render a string as an HTML table of its bytes and their ordinals.
   *
   * Every chunk of up to $maxCols bytes produces two table rows: a header
   * row (<th>) with the HTML-escaped bytes and a data row (<td>) with the
   * matching ord() values.
   *
   * @param string $str String to render, processed byte by byte.
   * @param integer $maxCols Maximum cells per row; a value below 1 puts
   *   everything into a single row pair.
   * @return string HTML table markup; '<table></table>' for an empty string.
   */
  public static function convertToOrdTable($str, $maxCols = 20) {
    $bytes = preg_split('//', (string)$str, -1, PREG_SPLIT_NO_EMPTY);
    $html = '<table>';
    if ($bytes) {
      $maxCols = (int)$maxCols;
      $rows = $maxCols > 0 ? array_chunk($bytes, $maxCols) : array($bytes);
      foreach ($rows as $row) {
        $cells = array();
        foreach ($row as $byte) {
          // A single-byte charset keeps arbitrary bytes intact while still
          // escaping the HTML special characters.
          $cells[] = htmlspecialchars($byte, ENT_QUOTES, 'ISO-8859-1');
        }
        $html .= '<tr><th>' . implode('</th><th>', $cells) . '</th></tr>';
        $html .= '<tr><td>' . implode('</td><td>', array_map('ord', $row)) . '</td></tr>';
      }
    }
    $html .= '</table>';
    return $html;
  }
  /**
   * Explode a string of given tags into an array.
   *
   * Tags are separated by commas. A tag may be wrapped in double quotes to
   * contain commas, and a literal quote inside a quoted tag is written as "".
   * Empty tags are dropped and duplicates are removed after unquoting and
   * trimming; the order of first appearance is kept.
   *
   * @param string $tags Comma-separated tag string as typed by a user.
   * @return array List of unique, unescaped tag names.
   */
  public function explodeTags($tags) {
    // This regexp allows the following types of user input:
    // this, "somecompany, llc", "and ""this"" w,o.rks", foo bar
    $regexp = '%(?:^|,\ *)("(?>[^"]*)(?>""[^"]* )*"|(?: [^",]*))%x';
    preg_match_all($regexp, $tags, $matches);

    $result = array();
    $seen = array();
    foreach ($matches[1] as $rawTag) {
      // If a user has escaped a term (to demonstrate that it is a group,
      // or includes a comma or quote character), we remove the escape
      // formatting so to save the term into the database as the user intends.
      $tag = trim(str_replace('""', '"', preg_replace('/^"(.*)"$/', '\1', $rawTag)));
      // Compare against '' explicitly so that the tag "0" is kept.
      if ($tag === '' || isset($seen[$tag])) {
        continue;
      }
      $seen[$tag] = true;
      $result[] = $tag;
    }
    return $result;
  }
  /**
   * Implode an array of tags into a string.
   *
   * Tags containing a comma or a double quote are wrapped in double quotes,
   * with every embedded quote doubled; all tags are then joined with ', '.
   *
   * @param array $tags Tag names.
   * @return string Comma-separated tag string.
   */
  public function implodeTags($tags) {
    $parts = array();
    foreach ($tags as $tag) {
      $isPlain = strpos($tag, ',') === false && strpos($tag, '"') === false;
      if ($isPlain) {
        $parts[] = $tag;
        continue;
      }
      // Commas and quotes are special characters, so quote the whole tag.
      $parts[] = '"' . str_replace('"', '""', $tag) . '"';
    }
    return implode(', ', $parts);
  }
  /**
   * Prevents [widow words](http://www.shauninman.com/archive/2006/08/22/widont_wordpress_plugin)
   * by replacing the last space of the right-trimmed text with a
   * non-breaking space entity.
   *
   * @param string|null $str Text to remove widows from; the stored text when null.
   * @return string Right-trimmed text with its last space turned into '&nbsp;'.
   */
  public function widont($str = null) {
    $text = rtrim($str === null ? $this->text : $str);
    $lastSpace = strrpos($text, ' ');
    if ($lastSpace === false) {
      return $text;
    }
    $before = substr($text, 0, $lastSpace);
    $after = substr($text, $lastSpace + 1);
    return $before . '&nbsp;' . $after;
  }
  185. /* text object specific */
  /**
   * Extract the unique words of the stored text.
   *
   * Line breaks and tabs (PHP_EOL and the NL/TB constants) are turned into
   * spaces, the text is split on spaces, and every piece has punctuation
   * characters and digits removed. Pieces that end up empty or violate the
   * length limits are dropped. Keys of the returned array are the positions
   * of the pieces in the de-duplicated split and are therefore not
   * necessarily consecutive.
   *
   * @param array $options
   * - min_char: minimum length of a word (unset/empty = no limit)
   * - max_char: maximum length of a word (unset/empty = no limit)
   * - case_sensitive: keep the original case instead of lowercasing
   * @return array Unique words.
   */
  public function words($options = array()) {
    $caseSensitive = !empty($options['case_sensitive']);
    $minChar = empty($options['min_char']) ? null : $options['min_char'];
    $maxChar = empty($options['max_char']) ? null : $options['max_char'];

    // Characters removed from every piece, built once outside the loop.
    // NOTE(review): the original list also held an empty string, which
    // str_replace() skips; it may have been a character lost to encoding.
    $strip = array(
      ',', '.', ';', ':', '#', '(', ')', '{', '}', '[', ']', '$', '%', '"', '!', '?', '<', '>', '=', '/',
      '1', '2', '3', '4', '5', '6', '7', '8', '9', '0'
    );

    $text = str_replace(array(PHP_EOL, NL, TB), ' ', $this->text);
    $pieces = array_unique(explode(' ', $text));

    foreach ($pieces as $key => $piece) {
      if (!$caseSensitive) {
        $piece = mb_strtolower($piece);
      }
      $piece = trim(str_replace($strip, '', $piece));

      $length = mb_strlen($piece);
      $tooShort = $minChar !== null && $length < $minChar;
      $tooLong = $maxChar !== null && $length > $maxChar;
      if ($piece === '' || $tooShort || $tooLong) {
        unset($pieces[$key]);
        continue;
      }
      $pieces[$key] = $piece;
    }

    // Stripping can make previously distinct pieces equal ("Hello," / "hello").
    return array_unique($pieces);
  }
  /**
   * Limit the number of words in a string.
   *
   * <code>
   * // Returns "This is a..."
   * echo TextLib::maxWords('This is a sentence.', 3);
   *
   * // Limit the number of words and append a custom ending
   * echo TextLib::maxWords('This is a sentence.', 3, array('ellipsis' => '---'));
   * </code>
   *
   * @param string $value
   * @param integer $words Maximum number of words to keep. A limit below 1
   *   keeps no words, so only the ellipsis is returned for non-blank input.
   * @param array $options
   * - ellipsis: string appended when the text was shortened (default '...')
   * - html: if set and App.encoding is UTF-8, the default ellipsis becomes
   *   the single UTF-8 ellipsis character
   * @return string The shortened text, '' for blank input, or $value itself
   *   when the word pattern cannot be applied to it.
   */
  public static function maxWords($value, $words = 100, $options = array()) {
    $default = array(
      'ellipsis' => '...'
    );
    if (!empty($options['html']) && Configure::read('App.encoding') === 'UTF-8') {
      $default['ellipsis'] = "\xe2\x80\xa6";
    }
    $options = array_merge($default, $options);

    if (trim($value) === '') {
      return '';
    }

    // Cast before interpolating into the pattern so that only an integer
    // can ever end up inside the quantifier.
    $words = (int)$words;
    if ($words < 1) {
      return $options['ellipsis'];
    }

    $pattern = '/^\s*+(?:\S++\s*+){1,' . $words . '}/';
    // Try the Unicode-aware pattern first; it fails on invalid UTF-8, in
    // which case the byte-wise pattern is used instead.
    if (!preg_match($pattern . 'u', $value, $matches) && !preg_match($pattern, $value, $matches)) {
      return $value;
    }

    $end = $options['ellipsis'];
    if (mb_strlen($value) === mb_strlen($matches[0])) {
      // Nothing was cut off, so no ellipsis is needed.
      $end = '';
    }
    return rtrim($matches[0]) . $end;
  }
  /**
   * High ASCII to Entities
   *
   * Converts bytes >= 128 to numeric character entities. Complete UTF-8
   * sequences of two, three or four bytes become one entity holding the
   * decoded code point. High bytes that do not form a complete sequence
   * before the next ASCII character or the end of the string (for example
   * single-byte "high ASCII" characters) become one entity per byte,
   * holding the byte value.
   *
   * @param string $str
   * @return string Text in which every byte >= 128 is replaced by entities.
   */
  public function ascii_to_entities($str) {
    $count = 1;
    $out = '';
    $temp = array();

    for ($i = 0, $s = strlen($str); $i < $s; $i++) {
      $ordinal = ord($str[$i]);

      if ($ordinal < 128) {
        // An ASCII byte ends any unfinished multi-byte sequence: output the
        // collected bytes as individual entities before continuing.
        foreach ($temp as $pending) {
          $out .= '&#' . $pending . ';';
        }
        $temp = array();
        $count = 1;

        $out .= $str[$i];
        continue;
      }

      if (count($temp) == 0) {
        // The lead byte determines how many bytes the sequence has.
        if ($ordinal < 224) {
          $count = 2;
        } elseif ($ordinal < 240) {
          $count = 3;
        } else {
          $count = 4;
        }
      }

      $temp[] = $ordinal;

      if (count($temp) == $count) {
        if ($count == 4) {
          $number = (($temp[0] % 8) * 262144) + (($temp[1] % 64) * 4096) + (($temp[2] % 64) * 64) + ($temp[3] % 64);
        } elseif ($count == 3) {
          $number = (($temp[0] % 16) * 4096) + (($temp[1] % 64) * 64) + ($temp[2] % 64);
        } else {
          $number = (($temp[0] % 32) * 64) + ($temp[1] % 64);
        }

        $out .= '&#' . $number . ';';
        $count = 1;
        $temp = array();
      }
    }

    // Do not lose high bytes left over at the end of the string.
    foreach ($temp as $pending) {
      $out .= '&#' . $pending . ';';
    }

    return $out;
  }
  294. // ------------------------------------------------------------------------
  /**
   * Entities to ASCII
   *
   * Converts decimal numeric character entities (&#NNN;) back to characters,
   * encoding code points above 127 as UTF-8 (up to U+10FFFF; larger numbers
   * are left untouched). With $all, the named entities &amp; &lt; &gt;
   * &quot; and &apos; are decoded too. Everything is decoded in one pass, so
   * the result of one replacement is never decoded a second time.
   *
   * @param string $str
   * @param boolean $all Whether to decode the named entities as well.
   * @return string
   */
  public function entities_to_ascii($str, $all = true) {
    $map = array();

    if (preg_match_all('/\&#(\d+)\;/', $str, $matches)) {
      for ($i = 0, $s = count($matches[0]); $i < $s; $i++) {
        $entity = $matches[0][$i];
        if (isset($map[$entity])) {
          continue;
        }
        $code = (int)$matches[1][$i];

        if ($code < 128) {
          $map[$entity] = chr($code);
        } elseif ($code < 2048) {
          $map[$entity] = chr(192 | ($code >> 6)) . chr(128 | ($code & 63));
        } elseif ($code < 65536) {
          $map[$entity] = chr(224 | ($code >> 12)) . chr(128 | (($code >> 6) & 63)) . chr(128 | ($code & 63));
        } elseif ($code <= 1114111) {
          $map[$entity] = chr(240 | ($code >> 18)) . chr(128 | (($code >> 12) & 63))
            . chr(128 | (($code >> 6) & 63)) . chr(128 | ($code & 63));
        }
        // Numbers beyond U+10FFFF are not valid code points; leave them as is.
      }
    }

    if ($all) {
      $map += array(
        '&amp;' => '&',
        '&lt;' => '<',
        '&gt;' => '>',
        '&quot;' => '"',
        '&apos;' => "'",
        '&#45;' => '-'
      );
    }

    if (!$map) {
      return $str;
    }
    // strtr() replaces in a single pass, so "&amp;lt;" yields "&lt;", not "<".
    return strtr($str, $map);
  }
  /**
   * Reduce Double Slashes
   *
   * Collapses every run of two or more slashes that follows a character
   * other than ':' into a single slash, which leaves the "//" of "http://"
   * alone.
   *
   * Example: http://www.some-site.com//index.php
   * becomes: http://www.some-site.com/index.php
   *
   * @param string $str
   * @return string
   */
  public function reduce_double_slashes($str) {
    // Group 1 captures the character in front of the slashes so it can be kept.
    $slashRun = '#([^:])//+#';
    $singleSlash = '\\1/';
    return preg_replace($slashRun, $singleSlash, $str);
  }
  346. // ------------------------------------------------------------------------
  /**
   * Reduce Multiples
   *
   * Reduces multiple consecutive instances of a particular character (or
   * character sequence) to a single one. Example:
   *
   * Fred, Bill,, Joe, Jimmy
   *
   * becomes:
   *
   * Fred, Bill, Joe, Jimmy
   *
   * @param string $str
   * @param string $character The character you wish to reduce; it may be a
   *   multi-byte character or a longer sequence, which is treated as a unit.
   * @param boolean $trim TRUE/FALSE - whether to also remove the character
   *   from the beginning/end. Only the boolean TRUE enables trimming.
   * @return string
   */
  public function reduce_multiples($str, $character = ',', $trim = false) {
    if ((string)$character === '') {
      // Nothing to reduce, and an empty pattern group would be invalid.
      return $str;
    }

    // Group the quoted text so the quantifier applies to all of it and not
    // just to its last byte.
    $unit = '(?:' . preg_quote($character, '#') . ')';
    // Escape "\" and "$" so the replacement is inserted literally.
    $replacement = addcslashes($character, '\\$');

    $str = preg_replace('#' . $unit . '{2,}#', $replacement, $str);

    if ($trim === true) {
      $str = preg_replace('#\A' . $unit . '+|' . $unit . '+\z#', '', $str);
    }

    return $str;
  }
  370. }