rss-bridge/bridges/WordPressBridge.php
ORelio 4520ab6835
[WordPressBridge] Improve content extraction (#3125)
* [WordPressBridge] Improve content extraction

 - Pick up currently unmaintained bridge
 - Allow Custom item limit and lower default limit from 20 to 10
 - Allow Custom content selector for blogs with non-standard templates (#2173)
 - Remove content selector made for one specific blog (#2173 - can be a custom selector now)
 - Add '.article-content' class in the set of default selectors
 - Improve lazy-loading conversion

* [WordPressBridge] Fix phpcs issues
2022-10-31 20:59:19 +01:00

138 lines
5.4 KiB
PHP

<?php
/**
 * Bridge for WordPress powered websites.
 *
 * Reads the Atom feed of the configured blog and, for every feed item,
 * downloads the linked article page in order to replace the feed content
 * with the article body found on that page.
 */
class WordPressBridge extends FeedExpander
{
    const NAME = 'Wordpress Bridge';
    const URI = 'https://wordpress.org/';
    const DESCRIPTION = 'Returns the newest full posts of a WordPress powered website';
    const MAINTAINER = 'ORelio';
    const PARAMETERS = [ [
        'url' => [
            'name' => 'Blog URL',
            'exampleValue' => 'https://wordpress.org/',
            'required' => true
        ],
        // Item limit parameter; its definition is not declared in this class (reached through self::LIMIT)
        'limit' => self::LIMIT,
        'content-selector' => [
            'name' => 'Content Selector (Optional - Advanced users)',
            'exampleValue' => '.custom-article-class',
        ],
    ]];

    /**
     * Strip unwanted markup from the extracted article body.
     *
     * @param string $content Inner HTML of the article element
     * @return string Cleaned HTML
     */
    private function cleanContent($content)
    {
        // Drop scripts (everything between '<script' and '</script>')
        $content = stripWithDelimiters($content, '<script', '</script>');
        // Drop everything from a '<div class="wpa"' up to the end of that line
        $content = preg_replace('/<div class="wpa".*/', '', $content);
        // Drop forms; '.' does not match newlines here, so only forms
        // opening and closing on the same line are removed
        $content = preg_replace('/<form.*\/form>/', '', $content);
        return $content;
    }

    /**
     * Locate the element holding the article body.
     *
     * A selector supplied by the user always wins and is never combined
     * with the defaults: when it matches nothing, null is returned.
     * Otherwise the default selectors are tried from most to least specific.
     *
     * @param object $article_html Parsed article page
     * @return object|null Matching element, or null when nothing matched
     */
    private function findArticleBody($article_html)
    {
        $custom_selector = $this->getInput('content-selector');
        if (!empty($custom_selector)) {
            // custom content selector (manually specified by user)
            return $article_html->find($custom_selector, 0);
        }

        $default_selectors = [
            '[itemprop=articleBody]', // highest priority content div (used for SEO)
            '.article-content',       // more precise than article when present
            'article',                // most common content div
            '.single-content',        // another common content div
            '.post-content',          // another common content div
            '.post',                  // for old WordPress themes without HTML5
        ];

        foreach ($default_selectors as $selector) {
            $candidate = $article_html->find($selector, 0);
            if (!is_null($candidate)) {
                return $candidate;
            }
        }

        return null;
    }

    /**
     * Blank out every h1 of the article whose text equals the item title,
     * so that the title is not repeated at the top of the content.
     *
     * @param object $article Article body element
     * @param string $title Title of the feed item
     * @return void
     */
    private function stripDuplicateTitle($article, $title)
    {
        $expected = trim((string) $title);
        foreach ($article->find('h1') as $heading) {
            // Trim the decoded heading text itself, then compare
            if (trim(html_entity_decode($heading->plaintext)) === $expected) {
                $heading->outertext = '';
            }
        }
    }

    /**
     * Convert lazy-loading images and iframes (videos...) by copying the
     * real source from the first non-empty lazy-loading attribute to 'src'.
     *
     * @param object $article Article body element
     * @return void
     */
    private function restoreLazyLoadedSources($article)
    {
        foreach ($article->find('img, iframe') as $element) {
            if (!empty($element->getAttribute('data-src'))) {
                $element->src = $element->getAttribute('data-src');
            } elseif (!empty($element->getAttribute('data-srcset'))) {
                // Keep the URL of the first candidate; trim first so that
                // leading whitespace does not yield an empty source
                $element->src = explode(' ', trim($element->getAttribute('data-srcset')))[0];
            } elseif (!empty($element->getAttribute('data-lazy-src'))) {
                $element->src = $element->getAttribute('data-lazy-src');
            } elseif (!empty($element->getAttribute('srcset'))) {
                $element->src = explode(' ', trim($element->getAttribute('srcset')))[0];
            }
        }
    }

    /**
     * Find the article main image and append it to the item enclosures.
     *
     * The article page is searched first; when it holds no usable
     * 'img.wp-post-image', the content delivered by the feed is searched.
     *
     * @param array $item Feed item being built
     * @param object $article_html Parsed article page
     * @return array The item, with the main image added when one was found
     */
    private function attachMainImage($item, $article_html)
    {
        $image = $article_html->find('img.wp-post-image', 0);
        if (!empty($item['content']) && (!is_object($image) || empty($image->src))) {
            $feed_dom = str_get_html($item['content']);
            // Defensive: only query the feed content when parsing gave an object
            $image = is_object($feed_dom) ? $feed_dom->find('img.wp-post-image', 0) : null;
        }

        if (!is_object($image) || empty($image->src)) {
            return $item;
        }

        $image_url = $image->src;
        $mime_type = parse_mime_type($image_url);
        if (strpos($mime_type, 'image') === false) {
            $image_url .= '#.image'; // force image
        }

        if (empty($item['enclosures'])) {
            $item['enclosures'] = [$image_url];
        } else {
            $item['enclosures'] = array_merge($item['enclosures'], [$image_url]);
        }

        return $item;
    }

    /**
     * Expand a feed item with the full article found on the linked page.
     *
     * When no article body can be located, the item is returned with the
     * content produced by the parent parser (the main image is still
     * looked up).
     *
     * @param mixed $newItem Raw feed item, forwarded to the parent parser
     * @return array Parsed item
     */
    protected function parseItem($newItem)
    {
        $item = parent::parseItem($newItem);

        $article_html = getSimpleHTMLDOMCached($item['uri']);
        if (!is_object($article_html)) {
            // Defensive: without a parsed page there is nothing to extract
            return $item;
        }

        // Find article body
        $article = $this->findArticleBody($article_html);

        if (!is_null($article)) {
            $this->stripDuplicateTitle($article, $item['title'] ?? '');
            $this->restoreLazyLoadedSources($article);
        }

        // Must run before the content is replaced below: the lookup
        // falls back on the content delivered by the feed
        $item = $this->attachMainImage($item, $article_html);

        if (!is_null($article)) {
            $item['content'] = $this->cleanContent($article->innertext);
            $item['content'] = defaultLinkTo($item['content'], $item['uri']);
        }

        return $item;
    }

    /**
     * @return string The blog URL given by the user, or the parent URI
     *                when no URL has been provided
     */
    public function getURI()
    {
        $url = $this->getInput('url');
        if (empty($url)) {
            $url = parent::getURI();
        }
        return $url;
    }

    /**
     * Collect the blog feed, trying the pretty permalink form of the Atom
     * feed first and the query string form when the first attempt throws.
     *
     * @return void
     */
    public function collectData()
    {
        $limit = $this->getInput('limit') ?? 10;

        $url = $this->getInput('url');
        if ($url && substr($url, 0, strlen('http')) !== 'http') {
            // just in case someone finds a way to access local files by playing with the url
            returnClientError('The url parameter must either refer to http or https protocol.');
        }

        // Avoid a double slash when the blog URL ends with '/'
        $base_uri = rtrim($this->getURI(), '/');

        try {
            $this->collectExpandableDatas($base_uri . '/feed/atom/', $limit);
        } catch (Exception $e) {
            $this->collectExpandableDatas($base_uri . '/?feed=atom', $limit);
        }
    }
}