From f27b267614a8146adfee30d7003f7181ec262a9c Mon Sep 17 00:00:00 2001 From: Anchit Bajaj Date: Wed, 28 Aug 2019 19:57:45 +0530 Subject: [PATCH] [GuardianBridge] - New bridge for the Guardian (#1249) * [GuardianBridge] - New bridge for the Guardian --- bridges/TheGuardianBridge.php | 96 +++++++++++++++++++++++++++++++++++ 1 file changed, 96 insertions(+) create mode 100644 bridges/TheGuardianBridge.php diff --git a/bridges/TheGuardianBridge.php b/bridges/TheGuardianBridge.php new file mode 100644 index 00000000..e655f0ef --- /dev/null +++ b/bridges/TheGuardianBridge.php @@ -0,0 +1,96 @@ + array( + 'name' => 'Feed', + 'type' => 'list', + 'values' => array( + 'World News' => 'world/rss', + 'US News' => '/us-news/rss', + 'UK News' => '/uk-news/rss', + 'Europe News' => '/world/europe-news/rss', + 'Asia News' => '/world/asia/rss', + 'Tech' => '/uk/technology/rss', + 'Business News' => '/uk/business/rss', + 'Opinion' => '/uk/commentisfree/rss', + 'Lifestyle' => '/uk/lifeandstyle/rss', + 'Culture' => '/uk/culture/rss', + 'Sports' => '/uk/sport/rss' + ) + ) + + /* + + Topicwise Links + + You can find the base feed for any topic by appending /rss to the url. + + Example: + + https://feeds.theguardian.com/theguardian/uk-news/rss + https://feeds.theguardian.com/theguardian/us-news/rss + + Or simply + + https://www.theguardian.com/world/rss + + Just add that topic as a value in the PARAMETERS const. + + */ + + + )); + + public function collectData(){ + $feed = $this->getInput('feed'); + $feedURL = 'https://feeds.theguardian.com/theguardian/' . 
ltrim((string)$feed, '/');
+		// The configured feed values are inconsistent about a leading '/'.
+		// It is stripped when building $feedURL so the URL never contains
+		// '//' after the base path (e.g. .../theguardian/uk-news/rss).
+		$this->collectExpandableDatas($feedURL, 10);
+	}
+
+	/**
+	 * Build a feed item and replace its content with the full article
+	 * scraped from the linked article page.
+	 *
+	 * The item is first built by parent::parseItem(). The article page is
+	 * then loaded, the first <figure> and every '.content__article-body'
+	 * element are concatenated, and a fixed list of unwanted elements is
+	 * stripped from the resulting HTML.
+	 *
+	 * @param mixed $newsItem Feed entry handed to this method; it must expose
+	 *                        a ->link property holding the article URL.
+	 * @return array Item as returned by parent::parseItem(); 'content' is
+	 *               overwritten only when article HTML could be extracted.
+	 */
+	protected function parseItem($newsItem){
+		$item = parent::parseItem($newsItem);
+
+		// --- Recovering the article ---
+
+		// $articlePage holds the parsed DOM of the whole article page
+		$articlePage = getSimpleHTMLDOM($newsItem->link);
+
+		// NOTE(review): getSimpleHTMLDOM() is defined outside this file. The
+		// guard assumes it may return a falsy value when the page cannot be
+		// loaded or parsed - confirm. Without it, ->find() below would be
+		// called on a non-object and abort the whole feed.
+		if(!$articlePage) {
+			return $item;
+		}
+
+		// The first <figure> is taken as the main article image.
+		// find() yields null when there is none, which casts to ''.
+		$article = (string)$articlePage->find('figure', 0);
+
+		// .content__article-body holds the actual article
+		foreach($articlePage->find('.content__article-body') as $bodyElement) {
+			$article .= $bodyElement;
+		}
+
+		// Nothing recognisable on the page: keep the item exactly as the
+		// parent built it instead of overwriting its content with ''.
+		if($article === '') {
+			return $item;
+		}
+
+		// --- Cleaning up the markup ---
+
+		// Replace the image viewer link wrapper with the image itself.
+		// Both nodes are cast explicitly so str_replace() only ever sees
+		// strings (a missing <img> becomes '' and the wrapper is dropped).
+		foreach($articlePage->find('a.article__img-container') as $imgContainer) {
+			$mainImg = $imgContainer->find('img', 0);
+			$article = str_replace((string)$imgContainer, (string)$mainImg, $article);
+		}
+
+		// Selectors of elements that should not appear in the feed content
+		$uselessElements = array(
+			'#show-caption',
+			'.element-atom',
+			'.submeta',
+			'youtube-media-atom',
+			'svg'
+		);
+
+		// Remove every match of the selectors above from the article HTML
+		foreach($uselessElements as $selector) {
+			foreach($articlePage->find($selector) as $uselessNode) {
+				$article = str_replace((string)$uselessNode, '', $article);
+			}
+		}
+
+		$item['content'] = $article;
+
+		return $item;
+	}
+}