Spade
Mini Shell
| Directory:~$ /home/lmsyaran/public_html/libraries/regularlabs/src/ |
| [Home] [System Details] [Kill Me] |
<?php
/**
* @package Regular Labs Library
* @version 21.2.19653
*
* @author Peter van Westen <info@regularlabs.com>
* @link http://www.regularlabs.com
* @copyright Copyright © 2021 Regular Labs All Rights Reserved
* @license http://www.gnu.org/licenses/gpl-2.0.html GNU/GPL
*/
namespace RegularLabs\Library;
defined('_JEXEC') or die;
use DOMDocument;
/**
* Class Html
* @package RegularLabs\Library
*/
class Html
{
/**
 * Convert content saved in a WYSIWYG editor to plain text (like removing
 * html tags)
 *
 * @param string $string
 *
 * @return string
 */
public static function convertWysiwygToPlainText($string)
{
    // Normalise non-breaking spaces to regular spaces: the raw UTF-8 byte
    // pair plus the named and numeric entity forms.
    // NOTE(review): this copy listed a plain space twice as needle (a no-op);
    // the entity forms are presumed to be what was intended - confirm against upstream.
    $string = str_replace([chr(194) . chr(160), '&nbsp;', '&#160;'], ' ', $string);

    // replace linebreak tags with normal linebreaks (paragraphs, enters, etc).
    $enter_tags = ['p', 'br'];
    $regex      = '</?((' . implode(')|(', $enter_tags) . '))+[^>]*?>\n?';
    $string     = RegEx::replace($regex, " \n", $string);

    // replace indent characters (sourcerer tab images) with spaces
    $string = RegEx::replace('<img [^>]*/sourcerer/images/tab\.png[^>]*>', ' ', $string);

    // strip all other tags
    $regex  = '<(/?\w+((\s+\w+(\s*=\s*(?:".*?"|\'.*?\'|[^\'">\s]+))?)+\s*|\s*)/?)>';
    $string = RegEx::replace($regex, '', $string);

    // reset htmlentities
    $string = StringHelper::html_entity_decoder($string);

    // convert protected html entities &_...; -> &...;
    $string = RegEx::replace('&_([a-z0-9\#]+?);', '&\1;', $string);

    return $string;
}
/**
 * Extract the <body>...</body> part from an entire html output string
 *
 * @param string $html
 * @param bool   $include_body_tag When true the opening and closing body tags
 *                                 are part of the middle element; when false
 *                                 they are attached to the first and last
 *                                 elements instead
 *
 * @return array [pre, body, post]; ['', $html, ''] when no usable body
 *               element is found
 */
public static function getBody($html, $include_body_tag = true)
{
    $open_pos  = strpos($html, '<body');
    $close_pos = strrpos($html, '</body>');

    if ($open_pos === false || $close_pos === false)
    {
        return ['', $html, ''];
    }

    // The opening tag has to be terminated before the last closing tag.
    // Without this guard a closing tag that precedes the opening tag (or an
    // unterminated opening tag) produced a result containing a '</body>'
    // that was never in the input.
    $open_end = strpos($html, '>', $open_pos);

    if ($open_end === false || $open_end > $close_pos)
    {
        return ['', $html, ''];
    }

    // Force string to UTF-8
    $html = StringHelper::convertToUtf8($html);

    // Everything before the first '<body'
    $split = explode('<body', $html, 2);
    $pre   = $split[0];

    // Attributes of the opening tag, then the remainder of the document
    $split      = explode('>', $split[1], 2);
    $body_start = '<body' . $split[0] . '>';
    $body_end   = '</body>';

    // Split on the closing tag; whatever follows the last one is the post
    // part, earlier occurrences stay inside the body
    $split = explode('</body>', $split[1]);
    $post  = array_pop($split);
    $body  = implode('</body>', $split);

    if ( ! $include_body_tag)
    {
        return [
            $pre . $body_start,
            $body,
            $body_end . $post,
        ];
    }

    return [
        $pre,
        $body_start . $body . $body_end,
        $post,
    ];
}
/**
 * Split a string into a pre, body and post part around the given search
 * strings, so replacements can be limited to the (smaller) body part
 *
 * The body starts $start_offset bytes before the earliest start search and
 * ends $end_offset bytes after the end of the last end search.
 *
 * @param string   $string
 * @param array    $start_searches
 * @param array    $end_searches   Falls back to $start_searches when empty
 * @param int      $start_offset
 * @param int|null $end_offset     Falls back to $start_offset when null
 *
 * @return array [pre, body, post]
 */
public static function getContentContainingSearches(
    $string,
    $start_searches = [],
    $end_searches = [],
    $start_offset = 1000,
    $end_offset = null
)
{
    // String is too short to split and search through
    if (strlen($string) < 2000)
    {
        return ['', $string, ''];
    }

    if ($end_offset === null)
    {
        $end_offset = $start_offset;
    }

    // Earliest position of any of the start searches
    $first_hit = null;

    foreach ($start_searches as $needle)
    {
        $position = strpos($string, $needle);

        if ($position !== false && ($first_hit === null || $position < $first_hit))
        {
            $first_hit = $position;
        }
    }

    // No searches are found
    if ($first_hit === null)
    {
        return [$string, '', ''];
    }

    // String is too short to split
    if (strlen($string) < ($start_offset + $end_offset + 1000))
    {
        return ['', $string, ''];
    }

    $cut_at = max($first_hit - $start_offset, 0);
    $pre    = substr($string, 0, $cut_at);
    $string = substr($string, $cut_at);

    self::fixBrokenTagsByPreString($pre, $string);

    if ( ! $end_searches)
    {
        $end_searches = $start_searches;
    }

    // Furthest end position of any of the end searches
    $last_end  = 0;
    $has_match = false;

    foreach ($end_searches as $needle)
    {
        $position = strrpos($string, $needle);

        if ($position === false)
        {
            continue;
        }

        $has_match = true;
        $last_end  = max($last_end, $position + strlen($needle));
    }

    // No end split is found, so don't split remainder
    if ( ! $has_match)
    {
        return [$pre, $string, ''];
    }

    $cut_at = min($last_end + $end_offset, strlen($string));
    $post   = substr($string, $cut_at);
    $string = substr($string, 0, $cut_at);

    self::fixBrokenTagsByPostString($post, $string);

    return [$pre, $string, $post];
}
/**
 * Check if string contains (opening or closing tags of) block elements
 *
 * @param string $string
 *
 * @return mixed Result of RegEx::match(); fix() uses it as a truthy/falsy flag
 */
public static function containsBlockElements($string)
{
    $names = implode('|', self::getBlockElements());
    $regex = '</?(' . $names . ')(?: [^>]*)?>';

    return RegEx::match($regex, $string);
}
/**
 * Fix broken/invalid html syntax in a string
 *
 * Strings without block elements are returned untouched.
 *
 * @param string $string
 *
 * @return string
 */
public static function fix($string)
{
    if ( ! self::containsBlockElements($string))
    {
        return $string;
    }

    // NOTE(review): the 'html-entities' mbstring encoding is deprecated as
    // of PHP 8.2 - plan a replacement.
    $has_mbstring = function_exists('mb_convert_encoding');

    // Convert utf8 characters to html entities
    if ($has_mbstring)
    {
        $string = mb_convert_encoding($string, 'html-entities', 'utf-8');
    }

    $string = self::protectSpecialCode($string);

    $string = self::convertDivsInsideInlineElementsToSpans($string);
    $string = self::removeParagraphsAroundBlockElements($string);
    $string = self::removeInlineElementsAroundBlockElements($string);
    $string = self::fixParagraphsAroundParagraphElements($string);

    $string = class_exists('DOMDocument')
        ? self::fixUsingDOMDocument($string)
        : self::fixUsingCustomFixer($string);

    $string = self::unprotectSpecialCode($string);

    // Convert html entities back to utf8 characters
    if ($has_mbstring)
    {
        // Make sure &lt; and &gt; don't get converted: escape their
        // ampersand so the decode below turns them back into &lt; / &gt;
        // instead of raw angle brackets.
        // NOTE(review): this copy replaced '<' / '>' with '&lt;' / '&gt;',
        // which the decode undoes; the entity form is presumed to be the
        // intended needle - confirm against upstream.
        $string = str_replace(['&lt;', '&gt;'], ['&amp;lt;', '&amp;gt;'], $string);
        $string = mb_convert_encoding($string, 'utf-8', 'html-entities');
    }

    $string = self::removeParagraphsAroundComments($string);

    return $string;
}
/**
 * Fix broken/invalid html syntax in an array of strings
 *
 * The values are joined with a ':|:' marker, fixed as one string, split on
 * the marker again and re-attached to the original keys.
 *
 * @param array $array
 *
 * @return array
 */
public static function fixArray($array)
{
    $glue = ':|:';

    $fixed  = self::fix(implode($glue, $array));
    $values = self::removeEmptyTags(explode($glue, $fixed));
    $keys   = array_keys($array);

    // use original keys but new values
    return array_combine($keys, $values);
}
/**
 * Removes empty tags which span concatenating parts in the array
 *
 * A tag pair is dropped when all it contains is the ':|:' part marker,
 * optionally surrounded by html comments and whitespace.
 *
 * @param array $array
 *
 * @return array
 */
public static function removeEmptyTags($array)
{
    $splitter = ':|:';
    $comments = '(?:\s*<\!--[^>]*-->\s*)*';

    $joined = implode($splitter, $array);

    Protect::protectHtmlCommentTags($joined);

    // Keep only the marker (and its surrounding comments) of such a pair
    $regex = '<([a-z][a-z0-9]*)(?: [^>]*)?>\s*('
        . $comments . RegEx::quote($splitter) . $comments
        . ')\s*</\1>';

    $joined = RegEx::replace($regex, '\2', $joined);

    Protect::unprotect($joined);

    return explode($splitter, $joined);
}
/**
 * Fix broken/invalid html syntax in a string using php DOMDocument
 * functionality
 *
 * @param string $string
 *
 * @return mixed
 */
private static function fixUsingDOMDocument($string)
{
    $doc = new DOMDocument;

    $doc->substituteEntities = false;

    list($pre, $body, $post) = Html::getBody($string, false);

    // Add temporary document structures
    $body = '<html><body><div>' . $body . '</div></body></html>';

    @$doc->loadHTML($body);

    $body = $doc->saveHTML();

    if (strpos($doc->documentElement->textContent, 'Ã') !== false)
    {
        // Need to do this utf8 workaround to deal with special characters
        // DOMDocument doesn't seem to deal with them very well
        // See: https://stackoverflow.com/questions/8218230/php-domdocument-loadhtml-not-encoding-utf-8-correctly/47396055#47396055
        $body = utf8_decode($doc->saveHTML($doc->documentElement));
    }

    // Remove temporary document structures and surrounding div
    $body = RegEx::replace(
        '^.*?<html>.*?(?:<head>(.*)</head>.*?)?<body>\s*<div>(.*)</div>\s*</body>.*?$',
        '\1\2',
        $body
    );

    // Remove leading/trailing empty div
    $body = RegEx::replace('(^\s*<div>\s*</div>|<div>\s*</div>\s*$)', '', $body);

    // Same replacement is applied a second time (kept as in the original)
    $body = RegEx::replace('(^\s*<div>\s*</div>|<div>\s*</div>\s*$)', '', $body);

    // Remove leading/trailing empty paragraph
    $body = RegEx::replace('(^\s*<p(?: [^>]*)?>\s*</p>|<p(?: [^>]*)?>\s*</p>\s*$)', '', $body);

    return $pre . $body . $post;
}
/**
 * Fix broken/invalid html syntax in a string using custom code as an
 * alternative to php DOMDocument functionality
 *
 * The string is cut in front of every opening block tag (divs excluded).
 * In each part that starts with such a tag, closing tags of other block
 * elements are removed and a missing closing tag is appended.
 *
 * @param string $string
 *
 * @return string
 */
private static function fixUsingCustomFixer($string)
{
    $block_regex = '<(' . implode('|', self::getBlockElementsNoDiv()) . ')[\s>]';

    $string = RegEx::replace('(' . $block_regex . ')', '[:SPLIT-BLOCK:]\1', $string);
    $parts  = explode('[:SPLIT-BLOCK:]', $string);

    foreach ($parts as &$part)
    {
        if ( ! RegEx::match('^' . $block_regex, $part, $type))
        {
            continue;
        }

        $type = strtolower($type[1]);

        // remove endings of other block elements
        $part = RegEx::replace(
            '</(?:' . implode('|', self::getBlockElementsNoDiv($type)) . ')>',
            '',
            $part
        );

        if (strpos($part, '</' . $type . '>') !== false)
        {
            continue;
        }

        // Add ending tag once
        $part = RegEx::replaceOnce('(\s*)$', '</' . $type . '>\1', $part);

        // Remove empty block tags
        $part = RegEx::replace('^<' . $type . '(?: [^>]*)?>\s*</' . $type . '>', '', $part);
    }

    // Break the reference so later writes to $part cannot alter $parts
    unset($part);

    return implode('', $parts);
}
/**
 * Removes complete html tag pairs from the concatenated parts
 *
 * The parts are joined with a ':|:' marker, cleaned as one string and split
 * on the marker again; the original keys are kept where available.
 *
 * @param array $parts
 * @param array $elements Tag names whose empty pairs are removed
 *
 * @return array
 */
public static function cleanSurroundingTags($parts, $elements = ['p', 'span'])
{
    // Filler allowed inside an "empty" tag: line breaks, comments, part markers
    $breaks = '(?:(?:<br ?/?>|<\!--[^>]*-->|:\|:)\s*)*';
    $keys   = array_keys($parts);

    $string = implode(':|:', $parts);

    Protect::protectHtmlCommentTags($string);

    // Remove empty tags
    $regex = '<(' . implode('|', $elements) . ')(?: [^>]*)?>\s*(' . $breaks . ')<\/\1>\s*';

    while (RegEx::match($regex, $string, $match))
    {
        $string = str_replace($match[0], $match[2], $string);
    }

    // Remove paragraphs around block elements
    $block_elements = [
        'p', 'div',
        'table', 'tr', 'td', 'thead', 'tfoot',
        'h[1-6]',
    ];
    $block_elements = '(' . implode('|', $block_elements) . ')';

    $regex = '(<p(?: [^>]*)?>)(\s*' . $breaks . ')(<' . $block_elements . '(?: [^>]*)?>)';

    while (RegEx::match($regex, $string, $match))
    {
        // A paragraph directly followed by another paragraph: merge both
        // opening tags instead of dropping the first one
        if ($match[4] == 'p')
        {
            $match[3] = $match[1] . $match[3];
            self::combinePTags($match[3]);
        }

        $string = str_replace($match[0], $match[2] . $match[3], $string);
    }

    $regex = '(</' . $block_elements . '>\s*' . $breaks . ')</p>';

    while (RegEx::match($regex, $string, $match))
    {
        $string = str_replace($match[0], $match[1], $string);
    }

    Protect::unprotect($string);

    $parts = explode(':|:', $string);

    // Re-attach the original keys; surplus parts keep their numeric index
    $new_tags = [];

    foreach ($parts as $key => $val)
    {
        $key            = isset($keys[$key]) ? $keys[$key] : $key;
        $new_tags[$key] = $val;
    }

    return $new_tags;
}
/**
 * Remove <p> tags around block elements
 *
 * @param string $string
 *
 * @return mixed
 */
private static function removeParagraphsAroundBlockElements($string)
{
    // Strict comparison: a '</p>' at offset 0 is still a match
    if (strpos($string, '</p>') === false)
    {
        return $string;
    }

    $block_names = implode('|', self::getBlockElements());

    Protect::protectHtmlCommentTags($string);

    // Opening <p> directly before a block tag (comments may sit in between)
    $string = RegEx::replace(
        '<p(?: [^>]*)?>\s*'
        . '((?:<\!--[^>]*-->\s*)*</?(?:' . $block_names . ')'
        . '(?: [^>]*)?>)',
        '\1',
        $string
    );

    // Closing </p> directly after a block tag (comments may sit in between)
    $string = RegEx::replace(
        '(</?(?:' . $block_names . ')' . '(?: [^>]*)?>(?:\s*<\!--[^>]*-->)*)'
        . '(?:\s*</p>)',
        '\1',
        $string
    );

    Protect::unprotect($string);

    return $string;
}
/**
 * Remove <p> tags around comments
 *
 * Only paragraphs that contain nothing but a single html comment (and
 * whitespace) are unwrapped.
 *
 * @param string $string
 *
 * @return mixed
 */
private static function removeParagraphsAroundComments($string)
{
    // Strict comparison: a '</p>' at offset 0 is still a match
    if (strpos($string, '</p>') === false)
    {
        return $string;
    }

    Protect::protectHtmlCommentTags($string);

    $string = RegEx::replace(
        '(?:<p(?: [^>]*)?>\s*)'
        . '(<\!--[^>]*-->)'
        . '(?:\s*</p>)',
        '\1',
        $string
    );

    Protect::unprotect($string);

    return $string;
}
/**
 * Fix <p> tags around other <p> elements
 *
 * The string is split on '</p>'. A segment without an opening <p> gets one
 * prepended; in a segment with an opening <p> followed by another one, a
 * '</p>' is inserted before the second.
 *
 * @param string $string
 *
 * @return mixed
 */
private static function fixParagraphsAroundParagraphElements($string)
{
    // Strict comparison: a '</p>' at offset 0 is still a match
    if (strpos($string, '</p>') === false)
    {
        return $string;
    }

    $parts  = explode('</p>', $string);
    $ending = '</p>' . array_pop($parts);

    foreach ($parts as &$part)
    {
        if (strpos($part, '<p>') === false && strpos($part, '<p ') === false)
        {
            $part = '<p>' . $part;
            continue;
        }

        $part = RegEx::replace(
            '(<p(?: [^>]*)?>.*?)(<p(?: [^>]*)?>)',
            '\1</p>\2',
            $part
        );
    }

    // Break the reference so later writes to $part cannot alter $parts
    unset($part);

    return implode('</p>', $parts) . $ending;
}
/**
 * Remove empty tag pairs
 *
 * A pair counts as empty when it only holds whitespace, <br> tags and/or
 * html comments; that inner content is kept, the surrounding tags are not.
 *
 * @param string $string
 * @param array  $elements Tag names to check
 *
 * @return mixed
 */
public static function removeEmptyTagPairs($string, $elements = ['p', 'span'])
{
    $breaks = '(?:(?:<br ?/?>|<\!--[^>]*-->)\s*)*';
    $names  = implode('|', $elements);
    $regex  = '<(' . $names . ')(?: [^>]*)?>\s*(' . $breaks . ')<\/\1>\s*';

    Protect::protectHtmlCommentTags($string);

    // Repeat until no empty pair is left (removing one can expose another)
    for (; ;)
    {
        if ( ! RegEx::match($regex, $string, $found))
        {
            break;
        }

        $string = str_replace($found[0], $found[2], $string);
    }

    Protect::unprotect($string);

    return $string;
}
/**
 * Convert <div> tags inside inline elements to <span> tags
 *
 * @param string $string
 *
 * @return mixed
 */
private static function convertDivsInsideInlineElementsToSpans($string)
{
    // Strict comparison: a '</div>' at offset 0 is still a match
    if (strpos($string, '</div>') === false)
    {
        return $string;
    }

    // Ignore block elements inside anchors
    $regex = '<(' . implode('|', self::getInlineElementsNoAnchor()) . ')(?: [^>]*)?>.*?</\1>';

    RegEx::matchAll($regex, $string, $matches, '', PREG_PATTERN_ORDER);

    if (empty($matches))
    {
        return $string;
    }

    // Full matches only, each distinct inline element once
    $matches = array_unique($matches[0]);

    $searches     = [];
    $replacements = [];

    foreach ($matches as $match)
    {
        if (strpos($match, '</div>') === false)
        {
            continue;
        }

        $searches[]     = $match;
        $replacements[] = str_replace(
            ['<div>', '<div ', '</div>'],
            ['<span>', '<span ', '</span>'],
            $match
        );
    }

    if (empty($searches))
    {
        return $string;
    }

    return str_replace($searches, $replacements, $string);
}
/**
 * Combine duplicate <p> tags
 * input:  <p class="aaa" a="1"><!-- ... --><p class="bbb" b="2">
 * output: <p class="aaa bbb" a="1" b="2"><!-- ... -->
 *
 * The string is changed in place.
 *
 * @param string $string
 */
public static function combinePTags(&$string)
{
    if (empty($string))
    {
        return;
    }

    $p_start_tag = '<p(?: [^>]*)?>';

    // Filler allowed between the two opening tags: comments, spaces and
    // non-breaking space entities.
    // NOTE(review): this copy only listed the numeric entity (&#160;); the
    // named form (&nbsp;) is added as its presumed counterpart - confirm
    // against upstream.
    $optional_tags = '\s*(?:<\!--[^>]*-->| |&nbsp;|&\#160;)*\s*';

    Protect::protectHtmlCommentTags($string);

    RegEx::matchAll(
        '(' . $p_start_tag . ')(' . $optional_tags . ')(' . $p_start_tag . ')',
        $string,
        $tags
    );

    if (empty($tags))
    {
        Protect::unprotect($string);

        return;
    }

    foreach ($tags as $tag)
    {
        // Keep the filler, then emit one tag combined from both opening tags
        $string = str_replace($tag[0], $tag[2] . HtmlTag::combine($tag[1], $tag[3]), $string);
    }

    Protect::unprotect($string);
}
/**
 * Remove inline elements around block elements
 *
 * Drops an opening inline tag (anchors excluded) directly in front of a
 * block tag, and a closing inline tag directly behind a block tag.
 *
 * @param string $string
 *
 * @return mixed
 */
public static function removeInlineElementsAroundBlockElements($string)
{
    $block_tag = '(</?(?:' . implode('|', self::getBlockElements()) . ')(?: [^>]*)?>)';

    // Opening inline tag + block tag -> block tag
    $inline_open = '(?:<(?:' . implode('|', self::getInlineElementsNoAnchor()) . ')(?: [^>]*)?>\s*)';
    $string      = RegEx::replace($inline_open . $block_tag, '\1', $string);

    // Block tag + closing inline tag -> block tag
    $inline_close = '(?:\s*</(?:' . implode('|', self::getInlineElementsNoAnchor()) . ')>)';
    $string       = RegEx::replace($block_tag . $inline_close, '\1', $string);

    return $string;
}
/**
 * Return an array of block element names, optionally without any of the
 * names given in $exclude
 *
 * When all six heading levels remain, they are collapsed into the single
 * regex fragment 'h[1-6]'.
 *
 * @param array|string $exclude A single name or a list of names
 *
 * @return array
 */
public static function getBlockElements($exclude = [])
{
    $excluded = is_array($exclude) ? $exclude : [$exclude];

    $remaining = array_diff(
        ['div', 'p', 'pre', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'],
        $excluded
    );

    // Work on a comma separated list so the full heading run can be
    // swapped for its character class in one go
    $list = str_replace(
        'h1,h2,h3,h4,h5,h6',
        'h[1-6]',
        implode(',', $remaining)
    );

    return explode(',', $list);
}
/**
 * Return an array of inline element names, optionally without any of the
 * names given in $exclude
 *
 * The keys of the remaining names are not renumbered.
 *
 * @param array|string $exclude A single name or a list of names
 *
 * @return array
 */
public static function getInlineElements($exclude = [])
{
    $excluded = is_array($exclude) ? $exclude : [$exclude];

    return array_diff(
        [
            'span', 'code', 'a',
            'strong', 'b', 'em', 'i',
            'u', 'big', 'small', 'font',
            'sup', 'sub',
        ],
        $excluded
    );
}
/**
 * Return an array of block element names, without divs and any of the
 * names given in $exclude
 *
 * @param array|string $exclude A single name or a list of names
 *
 * @return array
 */
public static function getBlockElementsNoDiv($exclude = [])
{
    $block_elements = self::getBlockElements($exclude);

    return array_diff($block_elements, ['div']);
}
/**
 * Return an array of inline element names, without anchors (a) and any of
 * the names given in $exclude
 *
 * @param array|string $exclude A single name or a list of names
 *
 * @return array
 */
public static function getInlineElementsNoAnchor($exclude = [])
{
    $inline_elements = self::getInlineElements($exclude);

    return array_diff($inline_elements, ['a']);
}
/**
 * Protect plugin style tags and php
 *
 * @param string $string
 *
 * @return mixed
 */
private static function protectSpecialCode($string)
{
    // Protect PHP code, written with raw or entity-encoded angle brackets.
    // NOTE(review): this copy had two identical alternatives, (<|<) and
    // (>|>); the second of each is presumed to be the entity form -
    // confirm against upstream.
    Protect::protectByRegex($string, '(<|&lt;)\?php\s.*?\?(>|&gt;)');

    // Protect {...} tags
    Protect::protectByRegex($string, '\{[a-z0-9].*?\}');

    // Protect [...] tags
    Protect::protectByRegex($string, '\[[a-z0-9].*?\]');

    // Protect scripts
    Protect::protectByRegex($string, '<script[^>]*>.*?</script>');

    // Protect css
    Protect::protectByRegex($string, '<style[^>]*>.*?</style>');

    Protect::convertProtectionToHtmlSafe($string);

    return $string;
}
/**
 * Unprotect protected tags
 *
 * Hands the string to Protect::unprotectHtmlSafe() and returns it.
 *
 * @param string $string
 *
 * @return mixed
 */
private static function unprotectSpecialCode($string)
{
    $restored = $string;

    Protect::unprotectHtmlSafe($restored);

    return $restored;
}
/**
 * Prevents broken html tags at the end of $pre (other half at beginning
 * of $string)
 * It will move the broken part to the beginning of $string to complete it
 *
 * Both arguments are changed in place.
 *
 * @param string $pre
 * @param string $string
 */
private static function fixBrokenTagsByPreString(&$pre, &$string)
{
    // An unterminated tag (or comment/doctype start) at the very end of $pre
    $has_open_tail = RegEx::match('<(\![^>]*|/?[a-z][^>]*(="[^"]*)?)$', $pre, $tail);

    if ($has_open_tail)
    {
        $fragment = $tail[0];

        $pre    = substr($pre, 0, -strlen($fragment));
        $string = $fragment . $string;
    }
}
/**
 * Prevents broken html tags at the beginning of $post (other half at end
 * of $string)
 * It will move the broken part to the end of $string to complete it
 *
 * Both arguments are changed in place.
 *
 * @param string $post
 * @param string $string
 */
private static function fixBrokenTagsByPostString(&$post, &$string)
{
    // $string has to end in an unterminated tag, and $post has to supply
    // the rest of it up to and including the first '>'
    if (
        RegEx::match('<(\![^>]*|/?[a-z][^>]*(="[^"]*)?)$', $string, $open_tail)
        && RegEx::match('^[^>]*>', $post, $closing)
    )
    {
        $fragment = $closing[0];

        $post   = substr($post, strlen($fragment));
        $string = $string . $fragment;
    }
}
/**
 * Removes html tags from string
 *
 * Inline tags are removed without a trace; all other tags are replaced
 * with a space. Scripts, noscripts and styles are removed including their
 * content.
 *
 * @param string $string
 * @param bool   $remove_comments Also replace html comments with a space
 *
 * @return string
 */
public static function removeHtmlTags($string, $remove_comments = false)
{
    // remove pagenavcounter
    $string = RegEx::replace('<div class="pagenavcounter">.*?</div>', ' ', $string);

    // remove pagenavbar
    $string = RegEx::replace('<div class="pagenavbar">(<div>.*?</div>)*</div>', ' ', $string);

    // remove inline scripts
    $string = RegEx::replace('<script[^a-z0-9].*?</script>', '', $string);
    $string = RegEx::replace('<noscript[^a-z0-9].*?</noscript>', '', $string);

    // remove inline styles
    $string = RegEx::replace('<style[^a-z0-9].*?</style>', '', $string);

    // remove inline html tags
    $string = RegEx::replace(
        '</?(' . implode('|', self::getInlineElements()) . ')( [^>]*)?>',
        '',
        $string
    );

    if ($remove_comments)
    {
        // remove html comments
        $string = RegEx::replace('<!--.*?-->', ' ', $string);
    }

    // replace other tags with a space
    $string = RegEx::replace('</?[a-z].*?>', ' ', $string);

    // remove double whitespace
    $string = trim(RegEx::replace('(\s)[ ]+', '\1', $string));

    return $string;
}
}