refactor: FeedExpander::parseItem() descendants (#3744)

This commit is contained in:
Dag 2023-10-13 00:25:34 +02:00 committed by GitHub
parent 9bda9e246a
commit 382648fc22
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
46 changed files with 314 additions and 281 deletions

View file

@ -20,17 +20,16 @@ class AcrimedBridge extends FeedExpander
public function collectData() public function collectData()
{ {
$this->collectExpandableDatas( $url = 'https://www.acrimed.org/spip.php?page=backend';
static::URI . 'spip.php?page=backend', $limit = $this->getInput('limit');
$this->getInput('limit') $this->collectExpandableDatas($url, $limit);
);
} }
protected function parseItem($newsItem) protected function parseItem($item)
{ {
$item = parent::parseItem($newsItem); $item = parent::parseItem($item);
$articlePage = getSimpleHTMLDOM($newsItem->link); $articlePage = getSimpleHTMLDOM($item['uri']);
$article = sanitize($articlePage->find('article.article1', 0)->innertext); $article = sanitize($articlePage->find('article.article1', 0)->innertext);
$article = defaultLinkTo($article, static::URI); $article = defaultLinkTo($article, static::URI);
$item['content'] = $article; $item['content'] = $article;

View file

@ -33,9 +33,9 @@ class ArsTechnicaBridge extends FeedExpander
$this->collectExpandableDatas($url); $this->collectExpandableDatas($url);
} }
protected function parseItem($newItem) protected function parseItem($item)
{ {
$item = parent::parseItem($newItem); $item = parent::parseItem($item);
$item_html = getSimpleHTMLDOMCached($item['uri'] . '&amp'); $item_html = getSimpleHTMLDOMCached($item['uri'] . '&amp');
$item_html = defaultLinkTo($item_html, self::URI); $item_html = defaultLinkTo($item_html, self::URI);

View file

@ -7,6 +7,12 @@ class BleepingComputerBridge extends FeedExpander
const URI = 'https://www.bleepingcomputer.com/'; const URI = 'https://www.bleepingcomputer.com/';
const DESCRIPTION = 'Returns the newest articles.'; const DESCRIPTION = 'Returns the newest articles.';
/**
 * Pass the feed address (the bridge URI followed by 'feed/') to
 * collectExpandableDatas().
 */
public function collectData()
{
    $this->collectExpandableDatas(static::URI . 'feed/');
}
protected function parseItem($item) protected function parseItem($item)
{ {
$item = parent::parseItem($item); $item = parent::parseItem($item);
@ -23,10 +29,4 @@ class BleepingComputerBridge extends FeedExpander
return $item; return $item;
} }
public function collectData()
{
$feed = static::URI . 'feed/';
$this->collectExpandableDatas($feed);
}
} }

View file

@ -43,9 +43,9 @@ class CNETFranceBridge extends FeedExpander
$this->collectExpandableDatas('https://www.cnetfrance.fr/feeds/rss/news/'); $this->collectExpandableDatas('https://www.cnetfrance.fr/feeds/rss/news/');
} }
protected function parseItem($feedItem) protected function parseItem($item)
{ {
$item = parent::parseItem($feedItem); $item = parent::parseItem($item);
foreach ($this->bannedTitle as $term) { foreach ($this->bannedTitle as $term) {
if (preg_match('/' . $term . '/mi', $item['title']) === 1) { if (preg_match('/' . $term . '/mi', $item['title']) === 1) {
@ -54,8 +54,7 @@ class CNETFranceBridge extends FeedExpander
} }
foreach ($this->bannedURL as $term) { foreach ($this->bannedURL as $term) {
$preg_match = preg_match('#' . $term . '#mi', $item['uri']); if (preg_match('#' . $term . '#mi', $item['uri'])) {
if ($preg_match === 1) {
return null; return null;
} }
} }

View file

@ -34,9 +34,9 @@ class CaschyBridge extends FeedExpander
); );
} }
protected function parseItem($feedItem) protected function parseItem($item)
{ {
$item = parent::parseItem($feedItem); $item = parent::parseItem($item);
if (strpos($item['uri'], 'https://stadt-bremerhaven.de/') !== 0) { if (strpos($item['uri'], 'https://stadt-bremerhaven.de/') !== 0) {
return $item; return $item;

View file

@ -12,9 +12,9 @@ class CommonDreamsBridge extends FeedExpander
$this->collectExpandableDatas('http://www.commondreams.org/rss.xml', 10); $this->collectExpandableDatas('http://www.commondreams.org/rss.xml', 10);
} }
protected function parseItem($newsItem) protected function parseItem($item)
{ {
$item = parent::parseItem($newsItem); $item = parent::parseItem($item);
$item['content'] = $this->extractContent($item['uri']); $item['content'] = $this->extractContent($item['uri']);
return $item; return $item;
} }

View file

@ -13,11 +13,11 @@ class CourrierInternationalBridge extends FeedExpander
$this->collectExpandableDatas(static::URI . 'feed/all/rss.xml', 20); $this->collectExpandableDatas(static::URI . 'feed/all/rss.xml', 20);
} }
protected function parseItem($feedItem) protected function parseItem($item)
{ {
$item = parent::parseItem($feedItem); $item = parent::parseItem($item);
$articlePage = getSimpleHTMLDOMCached($feedItem->link); $articlePage = getSimpleHTMLDOMCached($item['uri']);
$content = $articlePage->find('.article-text, depeche-text', 0); $content = $articlePage->find('.article-text, depeche-text', 0);
if (!$content) { if (!$content) {
return $item; return $item;

View file

@ -56,9 +56,10 @@ class DarkReadingBridge extends FeedExpander
$this->collectExpandableDatas($feed_url, $limit); $this->collectExpandableDatas($feed_url, $limit);
} }
protected function parseItem($newsItem) protected function parseItem($item)
{ {
$item = parent::parseItem($newsItem); $item = parent::parseItem($item);
$article = getSimpleHTMLDOMCached($item['uri']); $article = getSimpleHTMLDOMCached($item['uri']);
$item['content'] = $this->extractArticleContent($article); $item['content'] = $this->extractArticleContent($article);
$item['enclosures'] = []; //remove author profile picture $item['enclosures'] = []; //remove author profile picture

View file

@ -43,9 +43,9 @@ class DauphineLibereBridge extends FeedExpander
$this->collectExpandableDatas($url, 10); $this->collectExpandableDatas($url, 10);
} }
protected function parseItem($newsItem) protected function parseItem($item)
{ {
$item = parent::parseItem($newsItem); $item = parent::parseItem($item);
$item['content'] = $this->extractContent($item['uri']); $item['content'] = $this->extractContent($item['uri']);
return $item; return $item;
} }

View file

@ -163,19 +163,6 @@ class DeveloppezDotComBridge extends FeedExpander
] ]
]; ];
/**
* Return the RSS url for selected domain
*/
private function getRssUrl()
{
$domain = $this->getInput('domain');
if (!empty($domain)) {
return 'https://' . $domain . self::DOMAIN . self::RSS_URL;
}
return self::URI . self::RSS_URL;
}
/** /**
* Grabs the RSS item from Developpez.com * Grabs the RSS item from Developpez.com
*/ */
@ -189,15 +176,14 @@ class DeveloppezDotComBridge extends FeedExpander
* Parse the content of every RSS item. And will try to get the full article * Parse the content of every RSS item. And will try to get the full article
* pointed by the item URL instead of the default abstract. * pointed by the item URL instead of the default abstract.
*/ */
protected function parseItem($newsItem) protected function parseItem($item)
{ {
$item = parent::parseItem($item);
if (count($this->items) >= $this->getInput('limit')) { if (count($this->items) >= $this->getInput('limit')) {
return null; return null;
} }
// This function parse each entry in the RSS with the default parse
$item = parent::parseItem($newsItem);
// There is a bug in Developpez RSS: commas are written as '~?' in the // There is a bug in Developpez RSS: commas are written as '~?' in the
// title, so I have to fix it manually // title, so I have to fix it manually
$item['title'] = $this->fixComaInTitle($item['title']); $item['title'] = $this->fixComaInTitle($item['title']);
@ -229,6 +215,19 @@ class DeveloppezDotComBridge extends FeedExpander
return $item; return $item;
} }
/**
 * Build the RSS URL to fetch.
 *
 * When the 'domain' input is non-empty the URL is assembled from that value,
 * DOMAIN and RSS_URL; otherwise it is URI followed by RSS_URL.
 *
 * @return string
 */
private function getRssUrl()
{
    $selectedDomain = $this->getInput('domain');

    // No domain chosen: fall back to the bridge's base URI.
    if (empty($selectedDomain)) {
        return self::URI . self::RSS_URL;
    }

    return 'https://' . $selectedDomain . self::DOMAIN . self::RSS_URL;
}
/** /**
* Replace '~?' by a proper coma ',' * Replace '~?' by a proper coma ','
*/ */
@ -334,6 +333,9 @@ class DeveloppezDotComBridge extends FeedExpander
*/ */
private function isHtmlTagNotTxt($txt) private function isHtmlTagNotTxt($txt)
{ {
if ($txt === '') {
return false;
}
$html = str_get_html($txt); $html = str_get_html($txt);
return $html && $html->root && count($html->root->children) > 0; return $html && $html->root && count($html->root->children) > 0;
} }

View file

@ -93,21 +93,22 @@ class EconomistBridge extends FeedExpander
$limit = 30; $limit = 30;
} }
$this->collectExpandableDatas('https://www.economist.com/' . $category . '/rss.xml', $limit); $url = 'https://www.economist.com/' . $category . '/rss.xml';
$this->collectExpandableDatas($url, $limit);
} }
protected function parseItem($feedItem) protected function parseItem($item)
{ {
$item = parent::parseItem($feedItem); $item = parent::parseItem($item);
$html = getSimpleHTMLDOM($item['uri']); $dom = getSimpleHTMLDOM($item['uri']);
$article = $html->find('#new-article-template', 0); $article = $dom->find('#new-article-template', 0);
if ($article == null) { if ($article == null) {
$article = $html->find('main', 0); $article = $dom->find('main', 0);
} }
if ($article) { if ($article) {
$elem = $article->find('div', 0); $elem = $article->find('div', 0);
list($content, $audio_url) = $this->processContent($html, $elem); list($content, $audio_url) = $this->processContent($dom, $elem);
$item['content'] = $content; $item['content'] = $content;
if ($audio_url != null) { if ($audio_url != null) {
$item['enclosures'] = [$audio_url]; $item['enclosures'] = [$audio_url];

View file

@ -10,26 +10,28 @@ class EngadgetBridge extends FeedExpander
public function collectData() public function collectData()
{ {
$url = 'https://www.engadget.com/rss.xml';
$max = 10; $max = 10;
$this->collectExpandableDatas(static::URI . 'rss.xml', $max); $this->collectExpandableDatas($url, $max);
} }
protected function parseItem($newsItem) protected function parseItem($item)
{ {
$item = parent::parseItem($newsItem); $item = parent::parseItem($item);
$url = (string) $newsItem->link;
if (!$url) { $itemUrl = trim($item['uri']);
if (!$itemUrl) {
return $item; return $item;
} }
// todo: remove querystring tracking // todo: remove querystring tracking
$articlePage = getSimpleHTMLDOM($url); $dom = getSimpleHTMLDOM($itemUrl);
// figure contain's the main article image // figure contain's the main article image
$article = $articlePage->find('figure', 0); $article = $dom->find('figure', 0);
// .article-text has the actual article // .article-text has the actual article
foreach ($articlePage->find('.article-text') as $element) { foreach ($dom->find('.article-text') as $element) {
$article = $article . $element; $article = $article . $element;
} }
$item['content'] = $article; $item['content'] = $article ?? '';
return $item; return $item;
} }
} }

View file

@ -1,5 +1,8 @@
<?php <?php
/**
* Appears to be protected by cloudflare now
*/
class EsquerdaNetBridge extends FeedExpander class EsquerdaNetBridge extends FeedExpander
{ {
const MAINTAINER = 'somini'; const MAINTAINER = 'somini';
@ -23,32 +26,16 @@ class EsquerdaNetBridge extends FeedExpander
] ]
]; ];
public function getURI()
{
$type = $this->getInput('feed');
return self::URI . '/rss/' . $type;
}
public function getIcon()
{
return 'https://www.esquerda.net/sites/default/files/favicon_0.ico';
}
public function collectData() public function collectData()
{ {
parent::collectExpandableDatas($this->getURI()); parent::collectExpandableDatas($this->getURI());
} }
protected function parseItem($newsItem) protected function parseItem($item)
{ {
# Fix Publish date $item = parent::parseItem($item);
$badDate = $newsItem->pubDate;
preg_match('|(?P<day>\d\d)/(?P<month>\d\d)/(?P<year>\d\d\d\d) - (?P<hour>\d\d):(?P<minute>\d\d)|', $badDate, $d); $html = getSimpleHTMLDOMCached($item['uri']);
$newsItem->pubDate = sprintf('%s-%s-%sT%s:%s', $d['year'], $d['month'], $d['day'], $d['hour'], $d['minute']);
$item = parent::parseItem($newsItem);
# Include all the content
$uri = $item['uri'];
$html = getSimpleHTMLDOMCached($uri);
$content = $html->find('div#content div.content', 0); $content = $html->find('div#content div.content', 0);
## Fix author ## Fix author
$authorHTML = $html->find('.field-name-field-op-author a', 0); $authorHTML = $html->find('.field-name-field-op-author a', 0);
@ -72,4 +59,15 @@ class EsquerdaNetBridge extends FeedExpander
$item['content'] = $content; $item['content'] = $content;
return $item; return $item;
} }
/**
 * Return URI . '/rss/' followed by the value of the 'feed' input.
 *
 * @return string
 */
public function getURI()
{
    return self::URI . '/rss/' . $this->getInput('feed');
}
/**
 * Return the fixed icon URL used for this bridge.
 *
 * @return string
 */
public function getIcon()
{
    $iconUrl = 'https://www.esquerda.net/sites/default/files/favicon_0.ico';

    return $iconUrl;
}
} }

View file

@ -43,9 +43,4 @@ class FeedExpanderExampleBridge extends FeedExpander
returnClientError('Unknown version ' . $this->getInput('version') . '!'); returnClientError('Unknown version ' . $this->getInput('version') . '!');
} }
} }
protected function parseItem($newsItem)
{
return (array) $newsItem;
}
} }

View file

@ -0,0 +1,23 @@
<?php
declare(strict_types=1);
/**
 * Bare-bones FeedExpander bridge: it passes one hard-coded sample feed URL to
 * collectExpandableDatas() and defines no parseItem() override of its own.
 *
 * The metadata constants below are placeholder values.
 */
class FeedExpanderTestBridge extends FeedExpander
{
    const MAINTAINER = 'No maintainer';
    const NAME = 'Unnamed bridge';
    const URI = 'https://esdf.com/';
    const DESCRIPTION = 'No description provided';
    const PARAMETERS = [];
    const CACHE_TIMEOUT = 3600;

    /**
     * Hand the currently selected sample feed to collectExpandableDatas().
     */
    public function collectData()
    {
        // Other sample feeds that can be swapped in, one per format:
        //   rss 1.0: http://feeds.nature.com/nature/rss/current?format=xml
        //   rss 2.0: https://dvikan.no/feed.xml
        //   atom:    https://nedlasting.geonorge.no/geonorge/Tjenestefeed.xml
        $sampleFeed = 'http://static.userland.com/gems/backend/sampleRss.xml'; // rss 0.91

        $this->collectExpandableDatas($sampleFeed);
    }
}

View file

@ -82,9 +82,9 @@ class FilterBridge extends FeedExpander
$this->collectExpandableDatas($this->getURI()); $this->collectExpandableDatas($this->getURI());
} }
protected function parseItem($newItem) protected function parseItem($item)
{ {
$item = parent::parseItem($newItem); $item = parent::parseItem($item);
// Generate title from first 50 characters of content? // Generate title from first 50 characters of content?
if ($this->getInput('title_from_content') && array_key_exists('content', $item)) { if ($this->getInput('title_from_content') && array_key_exists('content', $item)) {

View file

@ -12,12 +12,12 @@ class ForGifsBridge extends FeedExpander
$this->collectExpandableDatas('https://forgifs.com/gallery/srss/7'); $this->collectExpandableDatas('https://forgifs.com/gallery/srss/7');
} }
protected function parseItem($feedItem) protected function parseItem($item)
{ {
$item = parent::parseItem($feedItem); $item = parent::parseItem($item);
$content = str_get_html($item['content']); $dom = str_get_html($item['content']);
$img = $content->find('img', 0); $img = $dom->find('img', 0);
$poster = $img->src; $poster = $img->src;
// The actual gif is the same path but its id must be decremented by one. // The actual gif is the same path but its id must be decremented by one.
@ -34,7 +34,7 @@ class ForGifsBridge extends FeedExpander
$img->width = 'auto'; $img->width = 'auto';
$img->height = 'auto'; $img->height = 'auto';
$item['content'] = $content; $item['content'] = (string) $dom;
return $item; return $item;
} }

View file

@ -14,15 +14,17 @@ class FreeCodeCampBridge extends FeedExpander
$this->collectExpandableDatas('https://www.freecodecamp.org/news/rss/', 15); $this->collectExpandableDatas('https://www.freecodecamp.org/news/rss/', 15);
} }
protected function parseItem($newsItem) protected function parseItem($item)
{ {
$item = parent::parseItem($newsItem); $item = parent::parseItem($item);
// $articlePage gets the entire page's contents
$articlePage = getSimpleHTMLDOM($newsItem->link); $dom = getSimpleHTMLDOM($item['uri']);
// figure contain's the main article image // figure contain's the main article image
$article = $articlePage->find('figure', 0); $article = $dom->find('figure', 0);
// the actual article // the actual article
foreach ($articlePage->find('.post-full-content') as $element) { foreach ($dom->find('.post-full-content') as $element) {
$article = $article . $element; $article = $article . $element;
} }
$item['content'] = $article; $item['content'] = $article;

View file

@ -85,13 +85,14 @@ class FuturaSciencesBridge extends FeedExpander
$this->collectExpandableDatas($url, 10); $this->collectExpandableDatas($url, 10);
} }
protected function parseItem($newsItem) protected function parseItem($item)
{ {
$item = parent::parseItem($newsItem); $item = parent::parseItem($item);
$item['uri'] = str_replace('#xtor%3DRSS-8', '', $item['uri']); $item['uri'] = str_replace('#xtor%3DRSS-8', '', $item['uri']);
$article = getSimpleHTMLDOMCached($item['uri']); $dom = getSimpleHTMLDOMCached($item['uri']);
$item['content'] = $this->extractArticleContent($article); $item['content'] = $this->extractArticleContent($dom);
$author = $this->extractAuthor($article); $author = $this->extractAuthor($dom);
if (!empty($author)) { if (!empty($author)) {
$item['author'] = $author; $item['author'] = $author;
} }

View file

@ -9,15 +9,15 @@ class HardwareInfoBridge extends FeedExpander
public function collectData() public function collectData()
{ {
$this->collectExpandableDatas('https://nl.hardware.info/updates/all.rss', 20); $this->collectExpandableDatas('https://nl.hardware.info/updates/all.rss', 10);
} }
protected function parseItem($feedItem) protected function parseItem($item)
{ {
$item = parent::parseItem($feedItem); $item = parent::parseItem($item);
//get full article $itemUrl = $item['uri'];
$articlePage = getSimpleHTMLDOMCached($feedItem->link); $articlePage = getSimpleHTMLDOMCached($itemUrl);
$article = $articlePage->find('div.article__content', 0); $article = $articlePage->find('div.article__content', 0);

View file

@ -125,9 +125,10 @@ class HeiseBridge extends FeedExpander
); );
} }
protected function parseItem($feedItem) protected function parseItem($item)
{ {
$item = parent::parseItem($feedItem); $item = parent::parseItem($item);
$sessioncookie = $this->getInput('sessioncookie'); $sessioncookie = $this->getInput('sessioncookie');
// strip rss parameter // strip rss parameter

View file

@ -10,17 +10,16 @@ class IGNBridge extends FeedExpander
public function collectData() public function collectData()
{ {
$this->collectExpandableDatas('http://feeds.ign.com/ign/all', 15); $this->collectExpandableDatas('http://feeds.ign.com/ign/all', 2);
} }
// IGNs feed is both hidden and incomplete. This bridge tries to fix this. // IGNs feed is both hidden and incomplete. This bridge tries to fix this.
protected function parseItem($newsItem) protected function parseItem($item)
{ {
$item = parent::parseItem($newsItem); $item = parent::parseItem($item);
// $articlePage gets the entire page's contents $articlePage = getSimpleHTMLDOM($item['uri']);
$articlePage = getSimpleHTMLDOM($newsItem->link);
// List of BS elements // List of BS elements
$uselessElements = [ $uselessElements = [
@ -33,7 +32,7 @@ class IGNBridge extends FeedExpander
'.jsx-4213937408', '.jsx-4213937408',
'.commerce-container', '.commerce-container',
'.widget-container', '.widget-container',
'.newsletter-signup-button' '.newsletter-signup-button',
]; ];
// Remove useless elements // Remove useless elements

View file

@ -12,9 +12,10 @@ class LeMondeInformatiqueBridge extends FeedExpander
$this->collectExpandableDatas(self::URI . 'rss/rss.xml', 10); $this->collectExpandableDatas(self::URI . 'rss/rss.xml', 10);
} }
protected function parseItem($newsItem) protected function parseItem($item)
{ {
$item = parent::parseItem($newsItem); $item = parent::parseItem($item);
$article_html = getSimpleHTMLDOMCached($item['uri']); $article_html = getSimpleHTMLDOMCached($item['uri']);
//Deduce thumbnail URL from article image URL //Deduce thumbnail URL from article image URL

View file

@ -13,12 +13,11 @@ class ListverseBridge extends FeedExpander
$this->collectExpandableDatas('https://listverse.com/feed/', 15); $this->collectExpandableDatas('https://listverse.com/feed/', 15);
} }
protected function parseItem($newsItem) protected function parseItem($item)
{ {
$item = parent::parseItem($newsItem); $item = parent::parseItem($item);
// $articlePage gets the entire page's contents $dom = getSimpleHTMLDOM($item['uri']);
$articlePage = getSimpleHTMLDOM($newsItem->link); $article = $dom->find('#articlecontentonly', 0);
$article = $articlePage->find('#articlecontentonly', 0);
$item['content'] = $article; $item['content'] = $article;
return $item; return $item;
} }

View file

@ -29,9 +29,11 @@ class MediapartBridge extends FeedExpander
$this->collectExpandableDatas($url); $this->collectExpandableDatas($url);
} }
protected function parseItem($newsItem) protected function parseItem($item)
{ {
$item = parent::parseItem($newsItem); $item = parent::parseItem($item);
$itemUrl = $item['uri'];
// Mediapart provide multiple type of contents. // Mediapart provide multiple type of contents.
// We only process items relative to the newspaper // We only process items relative to the newspaper
@ -49,12 +51,8 @@ class MediapartBridge extends FeedExpander
$opt = []; $opt = [];
$opt[CURLOPT_COOKIE] = 'MPSESSID=' . $mpsessid; $opt[CURLOPT_COOKIE] = 'MPSESSID=' . $mpsessid;
// Get the page $pageUrl = $itemUrl . '?onglet=full';
$articlePage = getSimpleHTMLDOM( $articlePage = getSimpleHTMLDOM($pageUrl, [], $opt);
$newsItem->link . '?onglet=full',
[],
$opt
);
// Extract the article content // Extract the article content
$content = $articlePage->find('div.content-article', 0)->innertext; $content = $articlePage->find('div.content-article', 0)->innertext;

View file

@ -22,17 +22,19 @@ class MsnMondeBridge extends FeedExpander
public function collectData() public function collectData()
{ {
$this->collectExpandableDatas(self::FEED_URL, self::LIMIT); $this->collectExpandableDatas(self::FEED_URL, 10);
} }
protected function parseItem($newsItem) protected function parseItem($item)
{ {
$item = parent::parseItem($newsItem); $item = parent::parseItem($item);
if (!preg_match('#fr-fr/actualite.*/ar-(?<id>[\w]*)\?#', $item['uri'], $matches)) { if (!preg_match('#fr-fr/actualite.*/ar-(?<id>[\w]*)\?#', $item['uri'], $matches)) {
return; return null;
} }
$json = json_decode(getContents(self::JSON_URL . $matches['id']), true); $jsonString = getContents(self::JSON_URL . $matches['id']);
$json = json_decode($jsonString, true);
$item['content'] = $json['body']; $item['content'] = $json['body'];
if (!empty($json['authors'])) { if (!empty($json['authors'])) {
$item['author'] = reset($json['authors'])['name']; $item['author'] = reset($json['authors'])['name'];

View file

@ -10,17 +10,18 @@ class NYTBridge extends FeedExpander
public function collectData() public function collectData()
{ {
$this->collectExpandableDatas('https://rss.nytimes.com/services/xml/rss/nyt/HomePage.xml', 40); $url = 'https://rss.nytimes.com/services/xml/rss/nyt/HomePage.xml';
$this->collectExpandableDatas($url, 40);
} }
protected function parseItem($newsItem) protected function parseItem($item)
{ {
$item = parent::parseItem($newsItem); $item = parent::parseItem($item);
$article = ''; $article = '';
// $articlePage gets the entire page's contents
try { try {
$articlePage = getSimpleHTMLDOM($newsItem->link); $articlePage = getSimpleHTMLDOM($item['uri']);
} catch (HttpException $e) { } catch (HttpException $e) {
// 403 Forbidden, This means we got anti-bot response // 403 Forbidden, This means we got anti-bot response
if ($e->getCode() === 403) { if ($e->getCode() === 403) {

View file

@ -88,9 +88,10 @@ class NextInpactBridge extends FeedExpander
$this->collectExpandableDatas($url, $limit); $this->collectExpandableDatas($url, $limit);
} }
protected function parseItem($newsItem) protected function parseItem($item)
{ {
$item = parent::parseItem($newsItem); $item = parent::parseItem($item);
$item['content'] = $this->extractContent($item, $item['uri']); $item['content'] = $this->extractContent($item, $item['uri']);
if (is_null($item['content'])) { if (is_null($item['content'])) {
return null; //Filtered article return null; //Filtered article

View file

@ -26,7 +26,8 @@ class NextgovBridge extends FeedExpander
public function collectData() public function collectData()
{ {
$this->collectExpandableDatas(self::URI . 'rss/' . $this->getInput('category') . '/', 10); $url = self::URI . 'rss/' . $this->getInput('category') . '/';
$this->collectExpandableDatas($url, 10);
} }
protected function parseItem($newsItem) protected function parseItem($newsItem)

View file

@ -12,9 +12,10 @@ class NiceMatinBridge extends FeedExpander
$this->collectExpandableDatas(self::URI . 'derniere-minute/rss', 10); $this->collectExpandableDatas(self::URI . 'derniere-minute/rss', 10);
} }
protected function parseItem($newsItem) protected function parseItem($item)
{ {
$item = parent::parseItem($newsItem); $item = parent::parseItem($item);
$item['content'] = $this->extractContent($item['uri']); $item['content'] = $this->extractContent($item['uri']);
return $item; return $item;
} }

View file

@ -65,12 +65,14 @@ class NyaaTorrentsBridge extends FeedExpander
$this->collectExpandableDatas($this->getURI(), 20); $this->collectExpandableDatas($this->getURI(), 20);
} }
protected function parseItem($newItem) protected function parseItem($newsItem)
{ {
$item = parent::parseItem($newItem); $item = parent::parseItem($newsItem);
$nyaaFields = (array)($newsItem->children('nyaa', true));
$item['id'] = str_replace(['https://nyaa.si/download/', '.torrent'], '', $item['uri']); $item['id'] = str_replace(['https://nyaa.si/download/', '.torrent'], '', $item['uri']);
$nyaaFields = (array)($newItem->children('nyaa', true));
$item = array_merge($item, $nyaaFields); $item = array_merge($item, $nyaaFields);
// Convert URI from torrent file to web page // Convert URI from torrent file to web page

View file

@ -117,18 +117,18 @@ class OnVaSortirBridge extends FeedExpander
] ]
]; ];
/**
 * Build the feed address from the 'city' input (used as the host prefix of
 * onvasortir.com) and pass it to collectExpandableDatas().
 */
public function collectData()
{
    $city = $this->getInput('city');
    $feedUrl = 'https://' . $city . '.onvasortir.com/rss.php';

    $this->collectExpandableDatas($feedUrl);
}
protected function parseItem($item) protected function parseItem($item)
{ {
$item = parent::parseItem($item); $item = parent::parseItem($item);
$html = getSimpleHTMLDOMCached($item['uri']); $dom = getSimpleHTMLDOMCached($item['uri']);
$text = $html->find('div.corpsMax', 0)->innertext; $text = $dom->find('div.corpsMax', 0)->innertext;
$item['content'] = utf8_encode($text); $item['content'] = utf8_encode($text);
return $item; return $item;
} }
public function collectData()
{
$this->collectExpandableDatas('https://' .
$this->getInput('city') . '.onvasortir.com/rss.php');
}
} }

View file

@ -29,22 +29,25 @@ but some RSS readers don\'t support this. "img" tag are supported by most browse
$this->collectExpandableDatas('https://www.phoronix.com/rss.php', $this->getInput('n')); $this->collectExpandableDatas('https://www.phoronix.com/rss.php', $this->getInput('n'));
} }
protected function parseItem($newsItem) protected function parseItem($item)
{ {
$item = parent::parseItem($newsItem); $item = parent::parseItem($item);
// $articlePage gets the entire page's contents
$articlePage = getSimpleHTMLDOM($newsItem->link); $itemUrl = $item['uri'];
$articlePage = getSimpleHTMLDOM($itemUrl);
$articlePage = defaultLinkTo($articlePage, $this->getURI()); $articlePage = defaultLinkTo($articlePage, $this->getURI());
// Extract final link. From Facebook's like plugin. // Extract final link. From Facebook's like plugin.
parse_str(parse_url($articlePage->find('iframe[src^=//www.facebook.com/plugins]', 0), PHP_URL_QUERY), $facebookQuery); $parsedUrlQuery = parse_url($articlePage->find('iframe[src^=//www.facebook.com/plugins]', 0), PHP_URL_QUERY);
parse_str($parsedUrlQuery, $facebookQuery);
if (array_key_exists('href', $facebookQuery)) { if (array_key_exists('href', $facebookQuery)) {
$newsItem->link = $facebookQuery['href']; $itemUrl = $facebookQuery['href'];
} }
$item['content'] = $this->extractContent($articlePage); $item['content'] = $this->extractContent($articlePage);
$pages = $articlePage->find('.pagination a[!title]'); $pages = $articlePage->find('.pagination a[!title]');
foreach ($pages as $page) { foreach ($pages as $page) {
$pageURI = urljoin($newsItem->link, html_entity_decode($page->href)); $pageURI = urljoin($itemUrl, html_entity_decode($page->href));
$page = getSimpleHTMLDOM($pageURI); $page = getSimpleHTMLDOM($pageURI);
$item['content'] .= $this->extractContent($page); $item['content'] .= $this->extractContent($page);
} }

View file

@ -6,9 +6,15 @@ class QwantzBridge extends FeedExpander
const URI = 'https://qwantz.com/'; const URI = 'https://qwantz.com/';
const DESCRIPTION = 'Latest comic.'; const DESCRIPTION = 'Latest comic.';
protected function parseItem($feedItem) public function collectData()
{ {
$item = parent::parseItem($feedItem); $this->collectExpandableDatas(self::URI . 'rssfeed.php');
}
protected function parseItem($item)
{
$item = parent::parseItem($item);
$item['author'] = 'Ryan North'; $item['author'] = 'Ryan North';
preg_match('/title="(.*?)"/', $item['content'], $matches); preg_match('/title="(.*?)"/', $item['content'], $matches);
@ -25,11 +31,6 @@ class QwantzBridge extends FeedExpander
return $item; return $item;
} }
public function collectData()
{
$this->collectExpandableDatas(self::URI . 'rssfeed.php');
}
public function getIcon() public function getIcon()
{ {
return self::URI . 'favicon.ico'; return self::URI . 'favicon.ico';

View file

@ -12,12 +12,11 @@ class RaceDepartmentBridge extends FeedExpander
$this->collectExpandableDatas('https://www.racedepartment.com/ams/index.rss', 10); $this->collectExpandableDatas('https://www.racedepartment.com/ams/index.rss', 10);
} }
protected function parseItem($feedItem) protected function parseItem($item)
{ {
$item = parent::parseItem($feedItem); $item = parent::parseItem($item);
//fetch page $articlePage = getSimpleHTMLDOMCached($item['uri']);
$articlePage = getSimpleHTMLDOMCached($feedItem->link);
$coverImage = $articlePage->find('img.js-articleCoverImage', 0); $coverImage = $articlePage->find('img.js-articleCoverImage', 0);
#relative url -> absolute url #relative url -> absolute url

View file

@ -42,9 +42,9 @@ class ScribbleHubBridge extends FeedExpander
$this->collectExpandableDatas($url); $this->collectExpandableDatas($url);
} }
protected function parseItem($newItem) protected function parseItem($item)
{ {
$item = parent::parseItem($newItem); $item = parent::parseItem($item);
//For series, filter out other series from 'All' feed //For series, filter out other series from 'All' feed
if ( if (
@ -57,7 +57,7 @@ class ScribbleHubBridge extends FeedExpander
$item['comments'] = $item['uri'] . '#comments'; $item['comments'] = $item['uri'] . '#comments';
try { try {
$item_html = getSimpleHTMLDOMCached($item['uri']); $dom = getSimpleHTMLDOMCached($item['uri']);
} catch (HttpException $e) { } catch (HttpException $e) {
// 403 Forbidden, This means we got anti-bot response // 403 Forbidden, This means we got anti-bot response
if ($e->getCode() === 403) { if ($e->getCode() === 403) {
@ -66,22 +66,22 @@ class ScribbleHubBridge extends FeedExpander
throw $e; throw $e;
} }
$item_html = defaultLinkTo($item_html, self::URI); $dom = defaultLinkTo($dom, self::URI);
//Retrieve full description from page contents //Retrieve full description from page contents
$item['content'] = $item_html->find('#chp_raw', 0); $item['content'] = $dom->find('#chp_raw', 0);
//Retrieve image for thumbnail //Retrieve image for thumbnail
$item_image = $item_html->find('.s_novel_img > img', 0)->src; $item_image = $dom->find('.s_novel_img > img', 0)->src;
$item['enclosures'] = [$item_image]; $item['enclosures'] = [$item_image];
//Restore lost categories //Restore lost categories
$item_story = html_entity_decode($item_html->find('.chp_byauthor > a', 0)->innertext); $item_story = html_entity_decode($dom->find('.chp_byauthor > a', 0)->innertext);
$item_sid = $item_html->find('#mysid', 0)->value; $item_sid = $dom->find('#mysid', 0)->value;
$item['categories'] = [$item_story, $item_sid]; $item['categories'] = [$item_story, $item_sid];
//Generate UID //Generate UID
$item_pid = $item_html->find('#mypostid', 0)->value; $item_pid = $dom->find('#mypostid', 0)->value;
$item['uid'] = $item_sid . "/$item_pid"; $item['uid'] = $item_sid . "/$item_pid";
return $item; return $item;

View file

@ -21,6 +21,12 @@ class SplCenterBridge extends FeedExpander
const CACHE_TIMEOUT = 3600; // 1 hour const CACHE_TIMEOUT = 3600; // 1 hour
public function collectData()
{
$url = $this->getURI() . '/rss.xml';
$this->collectExpandableDatas($url);
}
protected function parseItem($item) protected function parseItem($item)
{ {
$item = parent::parseItem($item); $item = parent::parseItem($item);
@ -37,11 +43,6 @@ class SplCenterBridge extends FeedExpander
return $item; return $item;
} }
public function collectData()
{
$this->collectExpandableDatas($this->getURI() . '/rss.xml');
}
public function getURI() public function getURI()
{ {
if (!is_null($this->getInput('content'))) { if (!is_null($this->getInput('content'))) {

View file

@ -30,14 +30,17 @@ class TapasBridge extends FeedExpander
protected $id; protected $id;
public function getURI() public function collectData()
{ {
if ($this->id) { if (preg_match('/^[\d]+$/', $this->getInput('title'))) {
return self::URI . 'rss/series/' . $this->id; $this->id = $this->getInput('title');
} else {
return self::URI . 'series/' . $this->getInput('title') . '/info/';
} }
return self::URI; if ($this->getInput('force_title') or !$this->id) {
$html = getSimpleHTMLDOM($this->getURI()) or returnServerError('Could not request ' . $this->getURI());
$this->id = $html->find('meta[property$=":url"]', 0)->content;
$this->id = str_ireplace(['tapastic://series/', '/info'], '', $this->id);
}
$this->collectExpandableDatas($this->getURI());
} }
protected function parseItem($feedItem) protected function parseItem($feedItem)
@ -72,16 +75,13 @@ class TapasBridge extends FeedExpander
return $item; return $item;
} }
public function collectData() public function getURI()
{ {
if (preg_match('/^[\d]+$/', $this->getInput('title'))) { if ($this->id) {
$this->id = $this->getInput('title'); return self::URI . 'rss/series/' . $this->id;
} else {
return self::URI . 'series/' . $this->getInput('title') . '/info/';
} }
if ($this->getInput('force_title') or !$this->id) { return self::URI;
$html = getSimpleHTMLDOM($this->getURI()) or returnServerError('Could not request ' . $this->getURI());
$this->id = $html->find('meta[property$=":url"]', 0)->content;
$this->id = str_ireplace(['tapastic://series/', '/info'], '', $this->id);
}
$this->collectExpandableDatas($this->getURI());
} }
} }

View file

@ -52,18 +52,15 @@ class TheGuardianBridge extends FeedExpander
public function collectData() public function collectData()
{ {
$feed = $this->getInput('feed'); $feed = $this->getInput('feed');
$feedURL = 'https://feeds.theguardian.com/theguardian/' . $feed; $url = 'https://feeds.theguardian.com/theguardian/' . $feed;
$this->collectExpandableDatas($feedURL, 10); $this->collectExpandableDatas($url, 10);
} }
protected function parseItem($newsItem) protected function parseItem($item)
{ {
$item = parent::parseItem($newsItem); $item = parent::parseItem($item);
// --- Recovering the article --- $articlePage = getSimpleHTMLDOM($item['uri']);
// $articlePage gets the entire page's contents
$articlePage = getSimpleHTMLDOM($newsItem->link);
// figure contain's the main article image // figure contain's the main article image
$article = $articlePage->find('figure', 0); $article = $articlePage->find('figure', 0);
// content__article-body has the actual article // content__article-body has the actual article

View file

@ -8,18 +8,24 @@ class TwitterEngineeringBridge extends FeedExpander
const DESCRIPTION = 'Returns the newest articles.'; const DESCRIPTION = 'Returns the newest articles.';
const CACHE_TIMEOUT = 21600; // 6h const CACHE_TIMEOUT = 21600; // 6h
public function collectData()
{
$url = 'https://blog.twitter.com/engineering/en_us/blog.rss';
$this->collectExpandableDatas($url);
}
protected function parseItem($item) protected function parseItem($item)
{ {
$item = parent::parseItem($item); $item = parent::parseItem($item);
$article_html = getSimpleHTMLDOMCached($item['uri']); $dom = getSimpleHTMLDOMCached($item['uri']);
if (!$article_html) { if (!$dom) {
$item['content'] .= '<p><em>Could not request ' . $this->getName() . ': ' . $item['uri'] . '</em></p>'; $item['content'] .= '<p><em>Could not request ' . $this->getName() . ': ' . $item['uri'] . '</em></p>';
return $item; return $item;
} }
$article_html = defaultLinkTo($article_html, $this->getURI()); $dom = defaultLinkTo($dom, $this->getURI());
$article_body = $article_html->find('div.column.column-6', 0); $article_body = $dom->find('div.column.column-6', 0);
// Remove elements that are not part of article content // Remove elements that are not part of article content
$unwanted_selector = 'div.bl02-blog-post-text-masthead, div.tweet-error-text, div.bl13-tweet-template'; $unwanted_selector = 'div.bl02-blog-post-text-masthead, div.tweet-error-text, div.bl13-tweet-template';
@ -33,8 +39,8 @@ class TwitterEngineeringBridge extends FeedExpander
} }
$item['content'] = $article_body; $item['content'] = $article_body;
$item['timestamp'] = strtotime($article_html->find('span.b02-blog-post-no-masthead__date', 0)->innertext); $item['timestamp'] = strtotime($dom->find('span.b02-blog-post-no-masthead__date', 0)->innertext);
$item['categories'] = self::getCategoriesFromTags($article_html); $item['categories'] = self::getCategoriesFromTags($dom);
return $item; return $item;
} }
@ -53,12 +59,6 @@ class TwitterEngineeringBridge extends FeedExpander
return $categories; return $categories;
} }
public function collectData()
{
$feed = static::URI . 'en_us/blog.rss';
$this->collectExpandableDatas($feed);
}
public function getName() public function getName()
{ {
// Else the original feed returns "English (US)" as the title // Else the original feed returns "English (US)" as the title

View file

@ -13,11 +13,11 @@ class VarietyBridge extends FeedExpander
$this->collectExpandableDatas('https://feeds.feedburner.com/variety/headlines', 15); $this->collectExpandableDatas('https://feeds.feedburner.com/variety/headlines', 15);
} }
protected function parseItem($newsItem) protected function parseItem($item)
{ {
$item = parent::parseItem($newsItem); $item = parent::parseItem($item);
// $articlePage gets the entire page's contents // $articlePage gets the entire page's contents
$articlePage = getSimpleHTMLDOM($newsItem->link); $articlePage = getSimpleHTMLDOM($item['uri']);
// Remove Script tags // Remove Script tags
foreach ($articlePage->find('script') as $script_tag) { foreach ($articlePage->find('script') as $script_tag) {

View file

@ -32,14 +32,14 @@ class ViceBridge extends FeedExpander
$this->collectExpandableDatas($feedURL, 10); $this->collectExpandableDatas($feedURL, 10);
} }
protected function parseItem($newsItem) protected function parseItem($item)
{ {
$item = parent::parseItem($newsItem); $item = parent::parseItem($item);
// $articlePage gets the entire page's contents
$articlePage = getSimpleHTMLDOM($newsItem->link); $articlePage = getSimpleHTMLDOM($item['uri']);
// text and embedded content // text and embedded content
$article = $articlePage->find('.article__body', 0); $article = $articlePage->find('.article__body', 0);
$item['content'] = $article; $item['content'] = $article ?? '';
return $item; return $item;
} }

View file

@ -50,13 +50,16 @@ class WiredBridge extends FeedExpander
$this->collectExpandableDatas($feed_url, $limit); $this->collectExpandableDatas($feed_url, $limit);
} }
protected function parseItem($newsItem) protected function parseItem($item)
{ {
$item = parent::parseItem($newsItem); $item = parent::parseItem($item);
$originalContent = $item['content'];
$article = getSimpleHTMLDOMCached($item['uri']); $article = getSimpleHTMLDOMCached($item['uri']);
$item['content'] = $this->extractArticleContent($article); $item['content'] = $this->extractArticleContent($article);
$headline = strval($newsItem->description); $headline = $originalContent;
if (!empty($headline)) { if (!empty($headline)) {
$item['content'] = '<p><b>' . $headline . '</b></p>' . $item['content']; $item['content'] = '<p><b>' . $headline . '</b></p>' . $item['content'];
} }

View file

@ -20,50 +20,56 @@ class WordPressBridge extends FeedExpander
], ],
]]; ]];
private function cleanContent($content) public function collectData()
{ {
$content = stripWithDelimiters($content, '<script', '</script>'); $limit = $this->getInput('limit') ?? 10;
$content = preg_replace('/<div class="wpa".*/', '', $content); if ($this->getInput('url') && substr($this->getInput('url'), 0, strlen('http')) !== 'http') {
$content = preg_replace('/<form.*\/form>/', '', $content); // just in case someone find a way to access local files by playing with the url
return $content; returnClientError('The url parameter must either refer to http or https protocol.');
}
try {
$this->collectExpandableDatas($this->getURI() . '/feed/atom/', $limit);
} catch (Exception $e) {
$this->collectExpandableDatas($this->getURI() . '/?feed=atom', $limit);
}
} }
protected function parseItem($newItem) protected function parseItem($item)
{ {
$item = parent::parseItem($newItem); $item = parent::parseItem($item);
$article_html = getSimpleHTMLDOMCached($item['uri']); $dom = getSimpleHTMLDOMCached($item['uri']);
// Find article body // Find article body
$article = null; $article = null;
switch (true) { switch (true) {
case !empty($this->getInput('content-selector')): case !empty($this->getInput('content-selector')):
// custom contect selector (manually specified by user) // custom contect selector (manually specified by user)
$article = $article_html->find($this->getInput('content-selector'), 0); $article = $dom->find($this->getInput('content-selector'), 0);
break; break;
case !is_null($article_html->find('[itemprop=articleBody]', 0)): case !is_null($dom->find('[itemprop=articleBody]', 0)):
// highest priority content div (used for SEO) // highest priority content div (used for SEO)
$article = $article_html->find('[itemprop=articleBody]', 0); $article = $dom->find('[itemprop=articleBody]', 0);
break; break;
case !is_null($article_html->find('.article-content', 0)): case !is_null($dom->find('.article-content', 0)):
// more precise than article when present // more precise than article when present
$article = $article_html->find('.article-content', 0); $article = $dom->find('.article-content', 0);
break; break;
case !is_null($article_html->find('article', 0)): case !is_null($dom->find('article', 0)):
// most common content div // most common content div
$article = $article_html->find('article', 0); $article = $dom->find('article', 0);
break; break;
case !is_null($article_html->find('.single-content', 0)): case !is_null($dom->find('.single-content', 0)):
// another common content div // another common content div
$article = $article_html->find('.single-content', 0); $article = $dom->find('.single-content', 0);
break; break;
case !is_null($article_html->find('.post-content', 0)): case !is_null($dom->find('.post-content', 0)):
// another common content div // another common content div
$article = $article_html->find('.post-content', 0); $article = $dom->find('.post-content', 0);
break; break;
case !is_null($article_html->find('.post', 0)): case !is_null($dom->find('.post', 0)):
// for old WordPress themes without HTML5 // for old WordPress themes without HTML5
$article = $article_html->find('.post', 0); $article = $dom->find('.post', 0);
break; break;
} }
@ -76,7 +82,7 @@ class WordPressBridge extends FeedExpander
// Find article main image // Find article main image
$article = convertLazyLoading($article); $article = convertLazyLoading($article);
$article_image = $article_html->find('img.wp-post-image', 0); $article_image = $dom->find('img.wp-post-image', 0);
if (!empty($item['content']) && (!is_object($article_image) || empty($article_image->src))) { if (!empty($item['content']) && (!is_object($article_image) || empty($article_image->src))) {
$article_image = str_get_html($item['content'])->find('img.wp-post-image', 0); $article_image = str_get_html($item['content'])->find('img.wp-post-image', 0);
} }
@ -106,6 +112,14 @@ class WordPressBridge extends FeedExpander
return $item; return $item;
} }
private function cleanContent($content)
{
$content = stripWithDelimiters($content, '<script', '</script>');
$content = preg_replace('/<div class="wpa".*/', '', $content);
$content = preg_replace('/<form.*\/form>/', '', $content);
return $content;
}
public function getURI() public function getURI()
{ {
$url = $this->getInput('url'); $url = $this->getInput('url');
@ -114,18 +128,4 @@ class WordPressBridge extends FeedExpander
} }
return $url; return $url;
} }
public function collectData()
{
$limit = $this->getInput('limit') ?? 10;
if ($this->getInput('url') && substr($this->getInput('url'), 0, strlen('http')) !== 'http') {
// just in case someone find a way to access local files by playing with the url
returnClientError('The url parameter must either refer to http or https protocol.');
}
try {
$this->collectExpandableDatas($this->getURI() . '/feed/atom/', $limit);
} catch (Exception $e) {
$this->collectExpandableDatas($this->getURI() . '/?feed=atom', $limit);
}
}
} }

View file

@ -30,9 +30,9 @@ class WorldOfTanksBridge extends FeedExpander
$this->collectExpandableDatas(sprintf('https://worldoftanks.eu/%s/rss/news/', $this->getInput('lang'))); $this->collectExpandableDatas(sprintf('https://worldoftanks.eu/%s/rss/news/', $this->getInput('lang')));
} }
protected function parseItem($newsItem) protected function parseItem($item)
{ {
$item = parent::parseItem($newsItem); $item = parent::parseItem($item);
$item['content'] = $this->loadFullArticle($item['uri']); $item['content'] = $this->loadFullArticle($item['uri']);
return $item; return $item;
} }

View file

@ -50,19 +50,19 @@ class ZeitBridge extends FeedExpander
'defaultValue' => 5 'defaultValue' => 5
] ]
]]; ]];
const LIMIT = 5;
public function collectData() public function collectData()
{ {
$this->collectExpandableDatas( $url = $this->getInput('category');
$this->getInput('category'), $limit = $this->getInput('limit') ?: 5;
$this->getInput('limit') ?: static::LIMIT
); $this->collectExpandableDatas($url, $limit);
} }
protected function parseItem($item) protected function parseItem($item)
{ {
$item = parent::parseItem($item); $item = parent::parseItem($item);
$item['enclosures'] = []; $item['enclosures'] = [];
$headers = [ $headers = [