From 54800fcc8d43d34294b661e52c7311d7b2df01b7 Mon Sep 17 00:00:00 2001 From: logmanoriginal Date: Sun, 18 Nov 2018 17:32:01 +0100 Subject: [PATCH] [html] Clarify meaning of strange find() parameter simple_html_dom currently doesn't support "->find('*')", which is a known issue: https://sourceforge.net/p/simplehtmldom/bugs/157/ The solution implemented by RSS-Bridge is to find all nodes WITHOUT a specific attribute. If the attribute is very unlikely to appear in the DOM, this is essentially returning all nodes. This is the meaning behind "->find('*[!b38fd2b1fe7f4747d6b1c1254ccd055e]')" --- lib/html.php | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/lib/html.php b/lib/html.php index a9eccef6..c8bf8d9c 100644 --- a/lib/html.php +++ b/lib/html.php @@ -26,8 +26,6 @@ * already removes some of the tags (search for `remove_noise` in simple_html_dom.php). * @todo Rename parameters to make more sense. `$textToSanitize` must be HTML, * `$removedTags`, `$keptAttributes` and `$keptText` are past tense. - * @todo Clarify the meaning of `*[!b38fd2b1fe7f4747d6b1c1254ccd055e]`, which - * looks like a SHA1 hash (does simplehtmldom not support `find('*')`?). */ function sanitize($textToSanitize, $removedTags = array('script', 'iframe', 'input', 'form'), @@ -35,6 +33,17 @@ $keptAttributes = array('title', 'href', 'src'), $keptText = array()){ $htmlContent = str_get_html($textToSanitize); + /* + * Notice: simple_html_dom currently doesn't support "->find(*)", which is a + * known issue: https://sourceforge.net/p/simplehtmldom/bugs/157/ + * + * A solution to this is to find all nodes WITHOUT a specific attribute. If + * the attribute is very unlikely to appear in the DOM, this is essentially + * returning all nodes. + * + * "*[!b38fd2b1fe7f4747d6b1c1254ccd055e]" is doing exactly that. The attrib + * "b38fd2b1fe7f4747d6b1c1254ccd055e" is very unlikely to appear in any DOM. 
+ */ foreach($htmlContent->find('*[!b38fd2b1fe7f4747d6b1c1254ccd055e]') as $element) { if(in_array($element->tag, $keptText)) { $element->outertext = $element->plaintext; @@ -76,15 +85,23 @@ $keptText = array()){ * * @param string $htmlContent The HTML content * @return string The HTML content with all ocurrences replaced - * - * @todo Clarify the meaning of `*[!b38fd2b1fe7f4747d6b1c1254ccd055e]`, which - * looks like a SHA1 hash (does simplehtmldom not support `find('*')`?). */ function backgroundToImg($htmlContent) { $regex = '/background-image[ ]{0,}:[ ]{0,}url\([\'"]{0,}(.*?)[\'"]{0,}\)/'; $htmlContent = str_get_html($htmlContent); + /* + * Notice: simple_html_dom currently doesn't support "->find(*)", which is a + * known issue: https://sourceforge.net/p/simplehtmldom/bugs/157/ + * + * A solution to this is to find all nodes WITHOUT a specific attribute. If + * the attribute is very unlikely to appear in the DOM, this is essentially + * returning all nodes. + * + * "*[!b38fd2b1fe7f4747d6b1c1254ccd055e]" is doing exactly that. The attrib + * "b38fd2b1fe7f4747d6b1c1254ccd055e" is very unlikely to appear in any DOM. + */ foreach($htmlContent->find('*[!b38fd2b1fe7f4747d6b1c1254ccd055e]') as $element) { if(preg_match($regex, $element->style, $matches) > 0) {