From 2880524dfc7685985fde8429c1dcb85387f4ba14 Mon Sep 17 00:00:00 2001 From: Dag <me@dvikan.no> Date: Fri, 13 Oct 2023 01:59:05 +0200 Subject: [PATCH] refactor: remove parent calls to parseItem (#3747) --- bridges/AcrimedBridge.php | 4 +- bridges/ArsTechnicaBridge.php | 4 +- bridges/BleepingComputerBridge.php | 4 +- bridges/CNETFranceBridge.php | 4 +- bridges/CaschyBridge.php | 4 +- bridges/CommonDreamsBridge.php | 3 +- bridges/CourrierInternationalBridge.php | 4 +- bridges/DarkReadingBridge.php | 4 +- bridges/DauphineLibereBridge.php | 3 +- bridges/DeutscheWelleBridge.php | 4 +- bridges/DeveloppezDotComBridge.php | 4 +- bridges/EconomistBridge.php | 3 +- bridges/EngadgetBridge.php | 4 +- bridges/EsquerdaNetBridge.php | 4 +- bridges/FeedExpanderTestBridge.php | 6 +- bridges/FilterBridge.php | 4 +- bridges/FolhaDeSaoPauloBridge.php | 4 +- bridges/ForGifsBridge.php | 4 +- bridges/FreeCodeCampBridge.php | 4 +- bridges/FuturaSciencesBridge.php | 4 +- bridges/GizmodoBridge.php | 4 +- bridges/GolemBridge.php | 3 +- bridges/HardwareInfoBridge.php | 4 +- bridges/HeiseBridge.php | 4 +- bridges/IGNBridge.php | 4 +- bridges/KoreusBridge.php | 4 +- bridges/LeMondeInformatiqueBridge.php | 4 +- bridges/ListverseBridge.php | 3 +- bridges/MediapartBridge.php | 4 +- bridges/MsnMondeBridge.php | 4 +- bridges/NYTBridge.php | 4 +- bridges/NextInpactBridge.php | 4 +- bridges/NextgovBridge.php | 4 +- bridges/NiceMatinBridge.php | 4 +- bridges/OnVaSortirBridge.php | 3 +- bridges/PhoronixBridge.php | 4 +- bridges/QwantzBridge.php | 6 +- bridges/RaceDepartmentBridge.php | 4 +- bridges/ScribbleHubBridge.php | 4 +- bridges/SplCenterBridge.php | 4 +- bridges/TapasBridge.php | 4 +- bridges/TheGuardianBridge.php | 4 +- bridges/TwitterEngineeringBridge.php | 4 +- bridges/VarietyBridge.php | 4 +- bridges/ViceBridge.php | 4 +- bridges/WeLiveSecurityBridge.php | 4 +- bridges/WiredBridge.php | 4 +- bridges/WordPressBridge.php | 4 +- bridges/WorldOfTanksBridge.php | 3 +- bridges/ZDNetBridge.php | 4 
+- bridges/ZeitBridge.php | 4 +- docs/05_Bridge_API/03_FeedExpander.md | 90 ++++++------------------- docs/05_Bridge_API/index.md | 2 +- lib/FeedExpander.php | 88 +++++------------------- lib/FeedParser.php | 8 +-- 55 files changed, 96 insertions(+), 293 deletions(-) diff --git a/bridges/AcrimedBridge.php b/bridges/AcrimedBridge.php index 93890f35..f7bbd58e 100644 --- a/bridges/AcrimedBridge.php +++ b/bridges/AcrimedBridge.php @@ -25,10 +25,8 @@ class AcrimedBridge extends FeedExpander $this->collectExpandableDatas($url, $limit); } - protected function parseItem($item) + protected function parseItem(array $item) { - $item = parent::parseItem($item); - $articlePage = getSimpleHTMLDOM($item['uri']); $article = sanitize($articlePage->find('article.article1', 0)->innertext); $article = defaultLinkTo($article, static::URI); diff --git a/bridges/ArsTechnicaBridge.php b/bridges/ArsTechnicaBridge.php index 98e5566b..d15cfb4f 100644 --- a/bridges/ArsTechnicaBridge.php +++ b/bridges/ArsTechnicaBridge.php @@ -33,10 +33,8 @@ class ArsTechnicaBridge extends FeedExpander $this->collectExpandableDatas($url); } - protected function parseItem($item) + protected function parseItem(array $item) { - $item = parent::parseItem($item); - $item_html = getSimpleHTMLDOMCached($item['uri'] . '&'); $item_html = defaultLinkTo($item_html, self::URI); $item['content'] = $item_html->find('.amp-wp-article-content', 0); diff --git a/bridges/BleepingComputerBridge.php b/bridges/BleepingComputerBridge.php index bad78561..79d84176 100644 --- a/bridges/BleepingComputerBridge.php +++ b/bridges/BleepingComputerBridge.php @@ -13,10 +13,8 @@ class BleepingComputerBridge extends FeedExpander $this->collectExpandableDatas($feed); } - protected function parseItem($item) + protected function parseItem(array $item) { - $item = parent::parseItem($item); - $article_html = getSimpleHTMLDOMCached($item['uri']); if (!$article_html) { $item['content'] .= '<p><em>Could not request ' . $this->getName() . ': ' . 
$item['uri'] . '</em></p>'; diff --git a/bridges/CNETFranceBridge.php b/bridges/CNETFranceBridge.php index d6a766de..35e92daf 100644 --- a/bridges/CNETFranceBridge.php +++ b/bridges/CNETFranceBridge.php @@ -43,10 +43,8 @@ class CNETFranceBridge extends FeedExpander $this->collectExpandableDatas('https://www.cnetfrance.fr/feeds/rss/news/'); } - protected function parseItem($item) + protected function parseItem(array $item) { - $item = parent::parseItem($item); - foreach ($this->bannedTitle as $term) { if (preg_match('/' . $term . '/mi', $item['title']) === 1) { return null; diff --git a/bridges/CaschyBridge.php b/bridges/CaschyBridge.php index 7d632bf6..0e3a07bc 100644 --- a/bridges/CaschyBridge.php +++ b/bridges/CaschyBridge.php @@ -34,10 +34,8 @@ class CaschyBridge extends FeedExpander ); } - protected function parseItem($item) + protected function parseItem(array $item) { - $item = parent::parseItem($item); - if (strpos($item['uri'], 'https://stadt-bremerhaven.de/') !== 0) { return $item; } diff --git a/bridges/CommonDreamsBridge.php b/bridges/CommonDreamsBridge.php index e1a185de..34532284 100644 --- a/bridges/CommonDreamsBridge.php +++ b/bridges/CommonDreamsBridge.php @@ -12,9 +12,8 @@ class CommonDreamsBridge extends FeedExpander $this->collectExpandableDatas('http://www.commondreams.org/rss.xml', 10); } - protected function parseItem($item) + protected function parseItem(array $item) { - $item = parent::parseItem($item); $item['content'] = $this->extractContent($item['uri']); return $item; } diff --git a/bridges/CourrierInternationalBridge.php b/bridges/CourrierInternationalBridge.php index 9e30fd51..3d9889b0 100644 --- a/bridges/CourrierInternationalBridge.php +++ b/bridges/CourrierInternationalBridge.php @@ -13,10 +13,8 @@ class CourrierInternationalBridge extends FeedExpander $this->collectExpandableDatas(static::URI . 
'feed/all/rss.xml', 20); } - protected function parseItem($item) + protected function parseItem(array $item) { - $item = parent::parseItem($item); - $articlePage = getSimpleHTMLDOMCached($item['uri']); $content = $articlePage->find('.article-text, depeche-text', 0); if (!$content) { diff --git a/bridges/DarkReadingBridge.php b/bridges/DarkReadingBridge.php index aca30490..4f1622e3 100644 --- a/bridges/DarkReadingBridge.php +++ b/bridges/DarkReadingBridge.php @@ -56,10 +56,8 @@ class DarkReadingBridge extends FeedExpander $this->collectExpandableDatas($feed_url, $limit); } - protected function parseItem($item) + protected function parseItem(array $item) { - $item = parent::parseItem($item); - $article = getSimpleHTMLDOMCached($item['uri']); $item['content'] = $this->extractArticleContent($article); $item['enclosures'] = []; //remove author profile picture diff --git a/bridges/DauphineLibereBridge.php b/bridges/DauphineLibereBridge.php index 0ab808cd..05748a5d 100644 --- a/bridges/DauphineLibereBridge.php +++ b/bridges/DauphineLibereBridge.php @@ -43,9 +43,8 @@ class DauphineLibereBridge extends FeedExpander $this->collectExpandableDatas($url, 10); } - protected function parseItem($item) + protected function parseItem(array $item) { - $item = parent::parseItem($item); $item['content'] = $this->extractContent($item['uri']); return $item; } diff --git a/bridges/DeutscheWelleBridge.php b/bridges/DeutscheWelleBridge.php index 2e10d670..29b478b9 100644 --- a/bridges/DeutscheWelleBridge.php +++ b/bridges/DeutscheWelleBridge.php @@ -71,10 +71,8 @@ class DeutscheWelleBridge extends FeedExpander $this->collectExpandableDatas($this->getInput('feed')); } - protected function parseItem($item) + protected function parseItem(array $item) { - $item = parent::parseItem($item); - $parsedUrl = parse_url($item['uri']); unset($parsedUrl['query']); $url = $this->unparseUrl($parsedUrl); diff --git a/bridges/DeveloppezDotComBridge.php b/bridges/DeveloppezDotComBridge.php index 
9dcbc31a..d9583fed 100644 --- a/bridges/DeveloppezDotComBridge.php +++ b/bridges/DeveloppezDotComBridge.php @@ -176,10 +176,8 @@ class DeveloppezDotComBridge extends FeedExpander * Parse the content of every RSS item. And will try to get the full article * pointed by the item URL intead of the default abstract. */ - protected function parseItem($item) + protected function parseItem(array $item) { - $item = parent::parseItem($item); - if (count($this->items) >= $this->getInput('limit')) { return null; } diff --git a/bridges/EconomistBridge.php b/bridges/EconomistBridge.php index 0572ab8f..aad72275 100644 --- a/bridges/EconomistBridge.php +++ b/bridges/EconomistBridge.php @@ -97,9 +97,8 @@ class EconomistBridge extends FeedExpander $this->collectExpandableDatas($url, $limit); } - protected function parseItem($item) + protected function parseItem(array $item) { - $item = parent::parseItem($item); $dom = getSimpleHTMLDOM($item['uri']); $article = $dom->find('#new-article-template', 0); diff --git a/bridges/EngadgetBridge.php b/bridges/EngadgetBridge.php index 3253cc2e..b9861b40 100644 --- a/bridges/EngadgetBridge.php +++ b/bridges/EngadgetBridge.php @@ -15,10 +15,8 @@ class EngadgetBridge extends FeedExpander $this->collectExpandableDatas($url, $max); } - protected function parseItem($item) + protected function parseItem(array $item) { - $item = parent::parseItem($item); - $itemUrl = trim($item['uri']); if (!$itemUrl) { return $item; diff --git a/bridges/EsquerdaNetBridge.php b/bridges/EsquerdaNetBridge.php index 64a6949f..aa92aa38 100644 --- a/bridges/EsquerdaNetBridge.php +++ b/bridges/EsquerdaNetBridge.php @@ -31,10 +31,8 @@ class EsquerdaNetBridge extends FeedExpander parent::collectExpandableDatas($this->getURI()); } - protected function parseItem($item) + protected function parseItem(array $item) { - $item = parent::parseItem($item); - $html = getSimpleHTMLDOMCached($item['uri']); $content = $html->find('div#content div.content', 0); ## Fix author diff --git 
a/bridges/FeedExpanderTestBridge.php b/bridges/FeedExpanderTestBridge.php index 9a6e7bb7..65b3db89 100644 --- a/bridges/FeedExpanderTestBridge.php +++ b/bridges/FeedExpanderTestBridge.php @@ -14,9 +14,9 @@ class FeedExpanderTestBridge extends FeedExpander public function collectData() { $url = 'http://static.userland.com/gems/backend/sampleRss.xml'; // rss 0.91 - //$url = 'http://feeds.nature.com/nature/rss/current?format=xml'; // rss 1.0 - //$url = 'https://dvikan.no/feed.xml'; // rss 2.0 - //$url = 'https://nedlasting.geonorge.no/geonorge/Tjenestefeed.xml'; // atom + $url = 'http://feeds.nature.com/nature/rss/current?format=xml'; // rss 1.0 + $url = 'https://dvikan.no/feed.xml'; // rss 2.0 + $url = 'https://nedlasting.geonorge.no/geonorge/Tjenestefeed.xml'; // atom $this->collectExpandableDatas($url); } diff --git a/bridges/FilterBridge.php b/bridges/FilterBridge.php index 3e3e812d..1add47f4 100644 --- a/bridges/FilterBridge.php +++ b/bridges/FilterBridge.php @@ -82,10 +82,8 @@ class FilterBridge extends FeedExpander $this->collectExpandableDatas($this->getURI()); } - protected function parseItem($item) + protected function parseItem(array $item) { - $item = parent::parseItem($item); - // Generate title from first 50 characters of content? 
if ($this->getInput('title_from_content') && array_key_exists('content', $item)) { $content = str_get_html($item['content']); diff --git a/bridges/FolhaDeSaoPauloBridge.php b/bridges/FolhaDeSaoPauloBridge.php index d8d93c4f..dba86c52 100644 --- a/bridges/FolhaDeSaoPauloBridge.php +++ b/bridges/FolhaDeSaoPauloBridge.php @@ -29,10 +29,8 @@ class FolhaDeSaoPauloBridge extends FeedExpander ] ]; - protected function parseItem($item) + protected function parseItem(array $item) { - $item = parent::parseItem($item); - if ($this->getInput('deep_crawl')) { $articleHTMLContent = getSimpleHTMLDOMCached($item['uri']); if ($articleHTMLContent) { diff --git a/bridges/ForGifsBridge.php b/bridges/ForGifsBridge.php index e210124a..0a054930 100644 --- a/bridges/ForGifsBridge.php +++ b/bridges/ForGifsBridge.php @@ -12,10 +12,8 @@ class ForGifsBridge extends FeedExpander $this->collectExpandableDatas('https://forgifs.com/gallery/srss/7'); } - protected function parseItem($item) + protected function parseItem(array $item) { - $item = parent::parseItem($item); - $dom = str_get_html($item['content']); $img = $dom->find('img', 0); $poster = $img->src; diff --git a/bridges/FreeCodeCampBridge.php b/bridges/FreeCodeCampBridge.php index 141746d2..aaeaf7bd 100644 --- a/bridges/FreeCodeCampBridge.php +++ b/bridges/FreeCodeCampBridge.php @@ -14,10 +14,8 @@ class FreeCodeCampBridge extends FeedExpander $this->collectExpandableDatas('https://www.freecodecamp.org/news/rss/', 15); } - protected function parseItem($item) + protected function parseItem(array $item) { - $item = parent::parseItem($item); - $dom = getSimpleHTMLDOM($item['uri']); // figure contain's the main article image diff --git a/bridges/FuturaSciencesBridge.php b/bridges/FuturaSciencesBridge.php index cfb2d711..6420f319 100644 --- a/bridges/FuturaSciencesBridge.php +++ b/bridges/FuturaSciencesBridge.php @@ -85,10 +85,8 @@ class FuturaSciencesBridge extends FeedExpander $this->collectExpandableDatas($url, 10); } - protected function 
parseItem($item) + protected function parseItem(array $item) { - $item = parent::parseItem($item); - $item['uri'] = str_replace('#xtor%3DRSS-8', '', $item['uri']); $dom = getSimpleHTMLDOMCached($item['uri']); $item['content'] = $this->extractArticleContent($dom); diff --git a/bridges/GizmodoBridge.php b/bridges/GizmodoBridge.php index 8ed30704..52812d33 100644 --- a/bridges/GizmodoBridge.php +++ b/bridges/GizmodoBridge.php @@ -8,10 +8,8 @@ class GizmodoBridge extends FeedExpander const CACHE_TIMEOUT = 1800; // 30min const DESCRIPTION = 'Returns the newest posts from Gizmodo.'; - protected function parseItem($item) + protected function parseItem(array $item) { - $item = parent::parseItem($item); - $html = getSimpleHTMLDOMCached($item['uri']); $html = defaultLinkTo($html, $this->getURI()); diff --git a/bridges/GolemBridge.php b/bridges/GolemBridge.php index 96fa4506..6699e433 100644 --- a/bridges/GolemBridge.php +++ b/bridges/GolemBridge.php @@ -63,9 +63,8 @@ class GolemBridge extends FeedExpander ); } - protected function parseItem($item) + protected function parseItem(array $item) { - $item = parent::parseItem($item); $item['content'] ??= ''; $uri = $item['uri']; diff --git a/bridges/HardwareInfoBridge.php b/bridges/HardwareInfoBridge.php index 6a47df66..5970ecd0 100644 --- a/bridges/HardwareInfoBridge.php +++ b/bridges/HardwareInfoBridge.php @@ -12,10 +12,8 @@ class HardwareInfoBridge extends FeedExpander $this->collectExpandableDatas('https://nl.hardware.info/updates/all.rss', 10); } - protected function parseItem($item) + protected function parseItem(array $item) { - $item = parent::parseItem($item); - $itemUrl = $item['uri']; $articlePage = getSimpleHTMLDOMCached($itemUrl); diff --git a/bridges/HeiseBridge.php b/bridges/HeiseBridge.php index 434e7514..f89594ee 100644 --- a/bridges/HeiseBridge.php +++ b/bridges/HeiseBridge.php @@ -125,10 +125,8 @@ class HeiseBridge extends FeedExpander ); } - protected function parseItem($item) + protected function 
parseItem(array $item) { - $item = parent::parseItem($item); - $sessioncookie = $this->getInput('sessioncookie'); // strip rss parameter diff --git a/bridges/IGNBridge.php b/bridges/IGNBridge.php index c0260cbd..e063dbbb 100644 --- a/bridges/IGNBridge.php +++ b/bridges/IGNBridge.php @@ -15,10 +15,8 @@ class IGNBridge extends FeedExpander // IGNs feed is both hidden and incomplete. This bridge tries to fix this. - protected function parseItem($item) + protected function parseItem(array $item) { - $item = parent::parseItem($item); - $articlePage = getSimpleHTMLDOM($item['uri']); // List of BS elements diff --git a/bridges/KoreusBridge.php b/bridges/KoreusBridge.php index 874c2c92..5ef5e11f 100644 --- a/bridges/KoreusBridge.php +++ b/bridges/KoreusBridge.php @@ -7,10 +7,8 @@ class KoreusBridge extends FeedExpander const URI = 'https://www.koreus.com/'; const DESCRIPTION = 'Returns the newest posts from Koreus (full text)'; - protected function parseItem($item) + protected function parseItem(array $item) { - $item = parent::parseItem($item); - $html = getSimpleHTMLDOMCached($item['uri']); $text = $html->find('p.itemText', 0)->innertext; $item['content'] = utf8_encode($text); diff --git a/bridges/LeMondeInformatiqueBridge.php b/bridges/LeMondeInformatiqueBridge.php index c91a0437..860c0887 100644 --- a/bridges/LeMondeInformatiqueBridge.php +++ b/bridges/LeMondeInformatiqueBridge.php @@ -12,10 +12,8 @@ class LeMondeInformatiqueBridge extends FeedExpander $this->collectExpandableDatas(self::URI . 
'rss/rss.xml', 10); } - protected function parseItem($item) + protected function parseItem(array $item) { - $item = parent::parseItem($item); - $article_html = getSimpleHTMLDOMCached($item['uri']); //Deduce thumbnail URL from article image URL diff --git a/bridges/ListverseBridge.php b/bridges/ListverseBridge.php index b7acbdd0..bffa4cda 100644 --- a/bridges/ListverseBridge.php +++ b/bridges/ListverseBridge.php @@ -13,9 +13,8 @@ class ListverseBridge extends FeedExpander $this->collectExpandableDatas('https://listverse.com/feed/', 15); } - protected function parseItem($item) + protected function parseItem(array $item) { - $item = parent::parseItem($item); $dom = getSimpleHTMLDOM($item['uri']); $article = $dom->find('#articlecontentonly', 0); $item['content'] = $article; diff --git a/bridges/MediapartBridge.php b/bridges/MediapartBridge.php index c4deda61..aa10e159 100644 --- a/bridges/MediapartBridge.php +++ b/bridges/MediapartBridge.php @@ -29,10 +29,8 @@ class MediapartBridge extends FeedExpander $this->collectExpandableDatas($url); } - protected function parseItem($item) + protected function parseItem(array $item) { - $item = parent::parseItem($item); - $itemUrl = $item['uri']; // Mediapart provide multiple type of contents. 
diff --git a/bridges/MsnMondeBridge.php b/bridges/MsnMondeBridge.php index 9b308b99..a2592702 100644 --- a/bridges/MsnMondeBridge.php +++ b/bridges/MsnMondeBridge.php @@ -25,10 +25,8 @@ class MsnMondeBridge extends FeedExpander $this->collectExpandableDatas(self::FEED_URL, 10); } - protected function parseItem($item) + protected function parseItem(array $item) { - $item = parent::parseItem($item); - if (!preg_match('#fr-fr/actualite.*/ar-(?<id>[\w]*)\?#', $item['uri'], $matches)) { return null; } diff --git a/bridges/NYTBridge.php b/bridges/NYTBridge.php index 57c3e2af..a9942e2d 100644 --- a/bridges/NYTBridge.php +++ b/bridges/NYTBridge.php @@ -14,10 +14,8 @@ class NYTBridge extends FeedExpander $this->collectExpandableDatas($url, 40); } - protected function parseItem($item) + protected function parseItem(array $item) { - $item = parent::parseItem($item); - $article = ''; try { diff --git a/bridges/NextInpactBridge.php b/bridges/NextInpactBridge.php index 0260da14..6982c104 100644 --- a/bridges/NextInpactBridge.php +++ b/bridges/NextInpactBridge.php @@ -88,10 +88,8 @@ class NextInpactBridge extends FeedExpander $this->collectExpandableDatas($url, $limit); } - protected function parseItem($item) + protected function parseItem(array $item) { - $item = parent::parseItem($item); - $item['content'] = $this->extractContent($item, $item['uri']); if (is_null($item['content'])) { return null; //Filtered article diff --git a/bridges/NextgovBridge.php b/bridges/NextgovBridge.php index bc92d306..7fe7130a 100644 --- a/bridges/NextgovBridge.php +++ b/bridges/NextgovBridge.php @@ -31,10 +31,8 @@ class NextgovBridge extends FeedExpander $this->collectExpandableDatas($url, $limit); } - protected function parseItem($item) + protected function parseItem(array $item) { - $item = parent::parseItem($item); - $article_thumbnail = 'https://cdn.nextgov.com/nextgov/images/logo.png'; $item['content'] = '<p><b>' . $item['content'] . 
'</b></p>'; diff --git a/bridges/NiceMatinBridge.php b/bridges/NiceMatinBridge.php index bcebbbbb..dd90dbfe 100644 --- a/bridges/NiceMatinBridge.php +++ b/bridges/NiceMatinBridge.php @@ -12,10 +12,8 @@ class NiceMatinBridge extends FeedExpander $this->collectExpandableDatas(self::URI . 'derniere-minute/rss', 10); } - protected function parseItem($item) + protected function parseItem(array $item) { - $item = parent::parseItem($item); - $item['content'] = $this->extractContent($item['uri']); return $item; } diff --git a/bridges/OnVaSortirBridge.php b/bridges/OnVaSortirBridge.php index 9f9a750c..f8c395c1 100644 --- a/bridges/OnVaSortirBridge.php +++ b/bridges/OnVaSortirBridge.php @@ -123,9 +123,8 @@ class OnVaSortirBridge extends FeedExpander $this->collectExpandableDatas($url); } - protected function parseItem($item) + protected function parseItem(array $item) { - $item = parent::parseItem($item); $dom = getSimpleHTMLDOMCached($item['uri']); $text = $dom->find('div.corpsMax', 0)->innertext; $item['content'] = utf8_encode($text); diff --git a/bridges/PhoronixBridge.php b/bridges/PhoronixBridge.php index fc0d78e5..227685e0 100644 --- a/bridges/PhoronixBridge.php +++ b/bridges/PhoronixBridge.php @@ -29,10 +29,8 @@ but some RSS readers don\'t support this. "img" tag are supported by most browse $this->collectExpandableDatas('https://www.phoronix.com/rss.php', $this->getInput('n')); } - protected function parseItem($item) + protected function parseItem(array $item) { - $item = parent::parseItem($item); - $itemUrl = $item['uri']; $articlePage = getSimpleHTMLDOM($itemUrl); diff --git a/bridges/QwantzBridge.php b/bridges/QwantzBridge.php index 2117c33c..b975bd43 100644 --- a/bridges/QwantzBridge.php +++ b/bridges/QwantzBridge.php @@ -11,14 +11,12 @@ class QwantzBridge extends FeedExpander $this->collectExpandableDatas(self::URI . 
'rssfeed.php'); } - protected function parseItem($item) + protected function parseItem(array $item) { - $item = parent::parseItem($item); - $item['author'] = 'Ryan North'; preg_match('/title="(.*?)"/', $item['content'], $matches); - $title = $matches[1]; + $title = $matches[1] ?? ''; $content = str_get_html(html_entity_decode($item['content'])); $comicURL = $content->find('img')[0]->{'src'}; diff --git a/bridges/RaceDepartmentBridge.php b/bridges/RaceDepartmentBridge.php index 3783b53e..7390761f 100644 --- a/bridges/RaceDepartmentBridge.php +++ b/bridges/RaceDepartmentBridge.php @@ -12,10 +12,8 @@ class RaceDepartmentBridge extends FeedExpander $this->collectExpandableDatas('https://www.racedepartment.com/ams/index.rss', 10); } - protected function parseItem($item) + protected function parseItem(array $item) { - $item = parent::parseItem($item); - $articlePage = getSimpleHTMLDOMCached($item['uri']); $coverImage = $articlePage->find('img.js-articleCoverImage', 0); diff --git a/bridges/ScribbleHubBridge.php b/bridges/ScribbleHubBridge.php index 8f52d461..e7cdf337 100644 --- a/bridges/ScribbleHubBridge.php +++ b/bridges/ScribbleHubBridge.php @@ -42,10 +42,8 @@ class ScribbleHubBridge extends FeedExpander $this->collectExpandableDatas($url); } - protected function parseItem($item) + protected function parseItem(array $item) { - $item = parent::parseItem($item); - //For series, filter out other series from 'All' feed if ( $this->queriedContext === 'Series' diff --git a/bridges/SplCenterBridge.php b/bridges/SplCenterBridge.php index ca764846..af25ec48 100644 --- a/bridges/SplCenterBridge.php +++ b/bridges/SplCenterBridge.php @@ -27,10 +27,8 @@ class SplCenterBridge extends FeedExpander $this->collectExpandableDatas($url); } - protected function parseItem($item) + protected function parseItem(array $item) { - $item = parent::parseItem($item); - $articleHtml = getSimpleHTMLDOMCached($item['uri']); foreach ($articleHtml->find('.file') as $index => $media) { diff --git 
a/bridges/TapasBridge.php b/bridges/TapasBridge.php index ddfbfb92..19995a23 100644 --- a/bridges/TapasBridge.php +++ b/bridges/TapasBridge.php @@ -43,10 +43,8 @@ class TapasBridge extends FeedExpander $this->collectExpandableDatas($this->getURI()); } - protected function parseItem($item) + protected function parseItem(array $item) { - $item = parent::parseItem($item); - // $namespaces = $feedItem->getNamespaces(true); // if (isset($namespaces['content'])) { // $description = $feedItem->children($namespaces['content']); diff --git a/bridges/TheGuardianBridge.php b/bridges/TheGuardianBridge.php index 98e56506..ac8a9661 100644 --- a/bridges/TheGuardianBridge.php +++ b/bridges/TheGuardianBridge.php @@ -56,10 +56,8 @@ class TheGuardianBridge extends FeedExpander $this->collectExpandableDatas($url, 10); } - protected function parseItem($item) + protected function parseItem(array $item) { - $item = parent::parseItem($item); - $articlePage = getSimpleHTMLDOM($item['uri']); // figure contain's the main article image $article = $articlePage->find('figure', 0); diff --git a/bridges/TwitterEngineeringBridge.php b/bridges/TwitterEngineeringBridge.php index b98cfb87..96319c97 100644 --- a/bridges/TwitterEngineeringBridge.php +++ b/bridges/TwitterEngineeringBridge.php @@ -14,10 +14,8 @@ class TwitterEngineeringBridge extends FeedExpander $this->collectExpandableDatas($url); } - protected function parseItem($item) + protected function parseItem(array $item) { - $item = parent::parseItem($item); - $dom = getSimpleHTMLDOMCached($item['uri']); if (!$dom) { $item['content'] .= '<p><em>Could not request ' . $this->getName() . ': ' . $item['uri'] . 
'</em></p>'; diff --git a/bridges/VarietyBridge.php b/bridges/VarietyBridge.php index 6625dca2..a49ea353 100644 --- a/bridges/VarietyBridge.php +++ b/bridges/VarietyBridge.php @@ -13,10 +13,8 @@ class VarietyBridge extends FeedExpander $this->collectExpandableDatas('https://feeds.feedburner.com/variety/headlines', 15); } - protected function parseItem($item) + protected function parseItem(array $item) { - $item = parent::parseItem($item); - // $articlePage gets the entire page's contents $articlePage = getSimpleHTMLDOM($item['uri']); // Remove Script tags diff --git a/bridges/ViceBridge.php b/bridges/ViceBridge.php index c7ecec33..dd81c559 100644 --- a/bridges/ViceBridge.php +++ b/bridges/ViceBridge.php @@ -32,10 +32,8 @@ class ViceBridge extends FeedExpander $this->collectExpandableDatas($feedURL, 10); } - protected function parseItem($item) + protected function parseItem(array $item) { - $item = parent::parseItem($item); - $articlePage = getSimpleHTMLDOM($item['uri']); // text and embedded content $article = $articlePage->find('.article__body', 0); diff --git a/bridges/WeLiveSecurityBridge.php b/bridges/WeLiveSecurityBridge.php index f54f6b29..151484c4 100644 --- a/bridges/WeLiveSecurityBridge.php +++ b/bridges/WeLiveSecurityBridge.php @@ -12,10 +12,8 @@ class WeLiveSecurityBridge extends FeedExpander ], ]; - protected function parseItem($item) + protected function parseItem(array $item) { - $item = parent::parseItem($item); - $html = getSimpleHTMLDOMCached($item['uri']); if (!$html) { $item['content'] .= '<br /><p><em>Could not request ' . $this->getName() . ': ' . $item['uri'] . 
'</em></p>'; diff --git a/bridges/WiredBridge.php b/bridges/WiredBridge.php index 7f7f6051..f7da288c 100644 --- a/bridges/WiredBridge.php +++ b/bridges/WiredBridge.php @@ -50,10 +50,8 @@ class WiredBridge extends FeedExpander $this->collectExpandableDatas($feed_url, $limit); } - protected function parseItem($item) + protected function parseItem(array $item) { - $item = parent::parseItem($item); - $originalContent = $item['content']; $article = getSimpleHTMLDOMCached($item['uri']); diff --git a/bridges/WordPressBridge.php b/bridges/WordPressBridge.php index fb27eb31..84a6b8ab 100644 --- a/bridges/WordPressBridge.php +++ b/bridges/WordPressBridge.php @@ -34,10 +34,8 @@ class WordPressBridge extends FeedExpander } } - protected function parseItem($item) + protected function parseItem(array $item) { - $item = parent::parseItem($item); - $dom = getSimpleHTMLDOMCached($item['uri']); // Find article body diff --git a/bridges/WorldOfTanksBridge.php b/bridges/WorldOfTanksBridge.php index 52691025..7bf20015 100644 --- a/bridges/WorldOfTanksBridge.php +++ b/bridges/WorldOfTanksBridge.php @@ -30,9 +30,8 @@ class WorldOfTanksBridge extends FeedExpander $this->collectExpandableDatas(sprintf('https://worldoftanks.eu/%s/rss/news/', $this->getInput('lang'))); } - protected function parseItem($item) + protected function parseItem(array $item) { - $item = parent::parseItem($item); $item['content'] = $this->loadFullArticle($item['uri']); return $item; } diff --git a/bridges/ZDNetBridge.php b/bridges/ZDNetBridge.php index 00b272ce..e3b659a8 100644 --- a/bridges/ZDNetBridge.php +++ b/bridges/ZDNetBridge.php @@ -174,10 +174,8 @@ class ZDNetBridge extends FeedExpander $this->collectExpandableDatas($url, $limit); } - protected function parseItem($item) + protected function parseItem(array $item) { - $item = parent::parseItem($item); - $article = getSimpleHTMLDOMCached($item['uri']); if (!$article) { $this->logger->info('Unable to parse the dom from ' . 
$item['uri']); diff --git a/bridges/ZeitBridge.php b/bridges/ZeitBridge.php index 7d7e89aa..0ed9276b 100644 --- a/bridges/ZeitBridge.php +++ b/bridges/ZeitBridge.php @@ -59,10 +59,8 @@ class ZeitBridge extends FeedExpander $this->collectExpandableDatas($url, $limit); } - protected function parseItem($item) + protected function parseItem(array $item) { - $item = parent::parseItem($item); - $item['enclosures'] = []; $headers = [ diff --git a/docs/05_Bridge_API/03_FeedExpander.md b/docs/05_Bridge_API/03_FeedExpander.md index 910d1abb..62356fc9 100644 --- a/docs/05_Bridge_API/03_FeedExpander.md +++ b/docs/05_Bridge_API/03_FeedExpander.md @@ -1,85 +1,35 @@ -`FeedExpander` extends [`BridgeAbstract`](./02_BridgeAbstract.md) and adds functions to collect data from existing feeds. - **Usage example**: _You have discovered a site that provides feeds which are hidden and inaccessible by normal means. You want your bridge to directly read the feeds and provide them via **RSS-Bridge**_ -To create a new Bridge extending `FeedExpander` you must implement all required functions of [`BridgeAbstract`](./02_BridgeAbstract.md). `FeedExpander` additionally provides following functions: - -* [`parseItem`](#the-parseitem-function) -* [`getName`](#the-getname-function) -* [`getURI`](#the-geturi-function) -* [`getDescription`](#the-getdescription-function) - Find a [template](#template) at the end of this file. **Notice:** For a standard feed only `collectData` need to be implemented. `collectData` should call `$this->collectExpandableDatas('your URI here');` to automatically load feed items and header data (will subsequently call `parseItem` for each item in the feed). You can limit the number of items to fetch by specifying an additional parameter for: `$this->collectExpandableDatas('your URI here', 10)` (limited to 10 items). -## The `parseItem` function +## The `parseItem` method -This function receives one item from the current feed and should return one **RSS-Bridge** item. 
+This method receives one item from the current feed and should return one **RSS-Bridge** item. The default function does all the work to get the item data from the feed, whether it is RSS 1.0, -RSS 2.0 or Atom 1.0. If you have to redefine this function in your **RSS-Bridge** for whatever reason, -you should first call the parent function to initialize the item, then apply the changes that you require. +RSS 2.0 or Atom 1.0. **Notice:** The following code sample is just an example. Implementation depends on your requirements! ```PHP -protected function parseItem($feedItem){ - $item = parent::parseItem($feedItem); - $item['content'] = str_replace('rssbridge','RSS-Bridge',$feedItem->content); - +protected function parseItem(array $item) +{ + $item['content'] = str_replace('rssbridge','RSS-Bridge',$item['content']); return $item; } ``` -### Helper functions +### Feed parsing -The `FeedExpander` already provides a set of functions to parse RSS or Atom items based on the specifications. Where possible make use of these functions: - -Function | Description ----------|------------ -`parseATOMItem` | Parses an Atom 1.0 feed item -`parseRSS_0_9_1_Item` | Parses an RSS 0.91 feed item -`parseRSS_1_0_Item` | Parses an RSS 1.0 feed item -`parseRSS_2_0_Item` | Parses an RSS 2.0 feed item - -In the following list you'll find the feed tags assigned to the the **RSS-Bridge** item keys: +How rss-bridge processes xml feeds: Function | uri | title | timestamp | author | content ---------|-----|-------|-----------|--------|-------- -`parseATOMItem` | id | title | updated | author | content -`parseRSS_0_9_1_Item` | link | title | | | description -`parseRSS_1_0_Item` | link | title | dc:date | dc:creator | description -`parseRSS_2_0_Item` | link, guid | title | pubDate, dc:date | author, dc:creator | description - -## The `getName` function - -Returns the name of the current feed. 
- -```PHP -return $this->name; -``` - -**Notice:** Only implement this function if you require different behavior! - -## The `getURI` function - -Return the uri for the current feed. - -```PHP -return $this->uri; -``` - -**Notice:** Only implement this function if you require different behavior! - -## The `getDescription` function - -Returns the description for the current bridge. - -```PHP -return $this->description; -``` - -**Notice:** Only implement this function if you require different behavior! +`atom` | id | title | updated | author | content +`rss 0.91` | link | title | | | description +`rss 1.0` | link | title | dc:date | dc:creator | description +`rss 2.0` | link, guid | title | pubDate, dc:date | author, dc:creator | description # Template @@ -87,19 +37,19 @@ This is the template for a new bridge: ```PHP <?php -class MySiteBridge extends FeedExpander { +class MySiteBridge extends FeedExpander +{ - const MAINTAINER = 'No maintainer'; - const NAME = 'Unnamed bridge'; - const URI = ''; - const DESCRIPTION = 'No description provided'; - const PARAMETERS = []; - const CACHE_TIMEOUT = 3600; + const MAINTAINER = 'No maintainer'; + const NAME = 'Unnamed bridge'; + const URI = ''; + const DESCRIPTION = 'No description provided'; + const PARAMETERS = []; + const CACHE_TIMEOUT = 3600; public function collectData() { $this->collectExpandableDatas('your feed URI'); } } -// Imaginary empty line! ``` \ No newline at end of file diff --git a/docs/05_Bridge_API/index.md b/docs/05_Bridge_API/index.md index e49e47be..06445246 100644 --- a/docs/05_Bridge_API/index.md +++ b/docs/05_Bridge_API/index.md @@ -7,7 +7,7 @@ and extends one of the base classes of **RSS-Bridge**: Base class | Description -----------|------------ [`BridgeAbstract`](./02_BridgeAbstract.md) | This class is intended for standard _Bridges_ that need to filter HTML pages for content. 
-[`FeedExpander`](./03_FeedExpander.md) | This class is an extension of `HttpCachingBridgeAbstract`, designed to load existing feeds into **RSS-Bridge** +[`FeedExpander`](./03_FeedExpander.md) | Expand/modify existing feed urls [`XPathAbstract`](./04_XPathAbstract.md) | This class is meant as an alternative base class for bridge implementations. It offers preliminary functionality for generating feeds based on _XPath expressions_. For more information about how to create a new _Bridge_, read [How to create a new Bridge?](./01_How_to_create_a_new_bridge.md) \ No newline at end of file diff --git a/lib/FeedExpander.php b/lib/FeedExpander.php index 70c4560d..f9cff900 100644 --- a/lib/FeedExpander.php +++ b/lib/FeedExpander.php @@ -5,111 +5,57 @@ */ abstract class FeedExpander extends BridgeAbstract { - const FEED_TYPE_RSS_1_0 = 'RSS_1_0'; - const FEED_TYPE_RSS_2_0 = 'RSS_2_0'; - const FEED_TYPE_ATOM_1_0 = 'ATOM_1_0'; - - private string $feedType; - private FeedParser $feedParser; - private array $parsedFeed; - - public function __construct(CacheInterface $cache, Logger $logger) - { - parent::__construct($cache, $logger); - $this->feedParser = new FeedParser(); - } + private array $feed; public function collectExpandableDatas(string $url, $maxItems = -1) { if (!$url) { throw new \Exception('There is no $url for this RSS expander'); } + $maxItems = (int) $maxItems; if ($maxItems === -1) { $maxItems = 999; } $accept = [MrssFormat::MIME_TYPE, AtomFormat::MIME_TYPE, '*/*']; $httpHeaders = ['Accept: ' . implode(', ', $accept)]; - // Notice we do not use cache here on purpose. 
We want a fresh view of the RSS stream each time $xmlString = getContents($url, $httpHeaders); if ($xmlString === '') { throw new \Exception(sprintf('Unable to parse xml from `%s` because we got the empty string', $url), 10); } - // Maybe move this call earlier up the stack frames - // Disable triggering of the php error-handler and handle errors manually instead - libxml_use_internal_errors(true); - // Consider replacing libxml with https://www.php.net/domdocument - // Intentionally not using the silencing operator (@) because it has no effect here - $xml = simplexml_load_string(trim($xmlString)); - if ($xml === false) { - $xmlErrors = libxml_get_errors(); - foreach ($xmlErrors as $xmlError) { - Debug::log(trim($xmlError->message)); - } - if ($xmlErrors) { - // Render only the first error into exception message - $firstXmlErrorMessage = $xmlErrors[0]->message; - } - throw new \Exception(sprintf('Unable to parse xml from `%s` %s', $url, $firstXmlErrorMessage ?? ''), 11); - } - // Restore previous behaviour in case other code relies on it being off - libxml_use_internal_errors(false); - - // Currently only feed metadata (not items) are plucked out - $this->parsedFeed = $this->feedParser->parseFeed($xmlString); - - if (isset($xml->item[0])) { - $this->feedType = self::FEED_TYPE_RSS_1_0; - $items = $xml->item; - } elseif (isset($xml->channel[0])) { - $this->feedType = self::FEED_TYPE_RSS_2_0; - $items = $xml->channel[0]->item; - } elseif (isset($xml->entry[0])) { - $this->feedType = self::FEED_TYPE_ATOM_1_0; - $items = $xml->entry; - } else { - throw new \Exception(sprintf('Unable to detect feed format from `%s`', $url)); - } + $feedParser = new FeedParser(); + $this->feed = $feedParser->parseFeed($xmlString); + $items = array_slice($this->feed['items'], 0, $maxItems); foreach ($items as $item) { - $parsedItem = $this->parseItem($item); - if ($parsedItem) { - $this->items[] = $parsedItem; - } - if (count($this->items) >= $maxItems) { - break; + // Give bridges a 
chance to modify the item + $item = $this->parseItem($item); + if ($item) { + $this->items[] = $item; } } - return $this; } /** - * @param \SimpleXMLElement $item The feed item to be parsed + * This method is overridden by bridges + * + * @return array */ - protected function parseItem($item) + protected function parseItem(array $item) { - switch ($this->feedType) { - case self::FEED_TYPE_RSS_1_0: - return $this->feedParser->parseRss1Item($item); - case self::FEED_TYPE_RSS_2_0: - return $this->feedParser->parseRss2Item($item); - case self::FEED_TYPE_ATOM_1_0: - return $this->feedParser->parseAtomItem($item); - default: - throw new \Exception(sprintf('Unknown version %s!', $this->getInput('version'))); - } + return $item; } public function getURI() { - return $this->parsedFeed['uri'] ?? parent::getURI(); + return $this->feed['uri'] ?? parent::getURI(); } public function getName() { - return $this->parsedFeed['title'] ?? parent::getName(); + return $this->feed['title'] ?? parent::getName(); } public function getIcon() { - return $this->parsedFeed['icon'] ?? parent::getIcon(); + return $this->feed['icon'] ?? parent::getIcon(); } } diff --git a/lib/FeedParser.php b/lib/FeedParser.php index 04452e7d..0a5b4679 100644 --- a/lib/FeedParser.php +++ b/lib/FeedParser.php @@ -14,10 +14,10 @@ final class FeedParser throw new \Exception('Unable to parse xml'); } $feed = [ - 'title' => null, - 'url' => null, - 'icon' => null, - 'items' => [], + 'title' => null, + 'uri' => null, + 'icon' => null, + 'items' => [], ]; if (isset($xml->item[0])) { // rss 1.0