refactor: FeedExpander::parseItem() descendants (#3744)

This commit is contained in:
Dag 2023-10-13 00:25:34 +02:00 committed by GitHub
parent 9bda9e246a
commit 382648fc22
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
46 changed files with 314 additions and 281 deletions

View file

@ -20,17 +20,16 @@ class AcrimedBridge extends FeedExpander
public function collectData()
{
$this->collectExpandableDatas(
static::URI . 'spip.php?page=backend',
$this->getInput('limit')
);
$url = 'https://www.acrimed.org/spip.php?page=backend';
$limit = $this->getInput('limit');
$this->collectExpandableDatas($url, $limit);
}
protected function parseItem($newsItem)
protected function parseItem($item)
{
$item = parent::parseItem($newsItem);
$item = parent::parseItem($item);
$articlePage = getSimpleHTMLDOM($newsItem->link);
$articlePage = getSimpleHTMLDOM($item['uri']);
$article = sanitize($articlePage->find('article.article1', 0)->innertext);
$article = defaultLinkTo($article, static::URI);
$item['content'] = $article;

View file

@ -33,9 +33,9 @@ class ArsTechnicaBridge extends FeedExpander
$this->collectExpandableDatas($url);
}
protected function parseItem($newItem)
protected function parseItem($item)
{
$item = parent::parseItem($newItem);
$item = parent::parseItem($item);
$item_html = getSimpleHTMLDOMCached($item['uri'] . '&amp');
$item_html = defaultLinkTo($item_html, self::URI);

View file

@ -7,6 +7,12 @@ class BleepingComputerBridge extends FeedExpander
const URI = 'https://www.bleepingcomputer.com/';
const DESCRIPTION = 'Returns the newest articles.';
public function collectData()
{
$feed = static::URI . 'feed/';
$this->collectExpandableDatas($feed);
}
protected function parseItem($item)
{
$item = parent::parseItem($item);
@ -23,10 +29,4 @@ class BleepingComputerBridge extends FeedExpander
return $item;
}
public function collectData()
{
$feed = static::URI . 'feed/';
$this->collectExpandableDatas($feed);
}
}

View file

@ -43,9 +43,9 @@ class CNETFranceBridge extends FeedExpander
$this->collectExpandableDatas('https://www.cnetfrance.fr/feeds/rss/news/');
}
protected function parseItem($feedItem)
protected function parseItem($item)
{
$item = parent::parseItem($feedItem);
$item = parent::parseItem($item);
foreach ($this->bannedTitle as $term) {
if (preg_match('/' . $term . '/mi', $item['title']) === 1) {
@ -54,8 +54,7 @@ class CNETFranceBridge extends FeedExpander
}
foreach ($this->bannedURL as $term) {
$preg_match = preg_match('#' . $term . '#mi', $item['uri']);
if ($preg_match === 1) {
if (preg_match('#' . $term . '#mi', $item['uri'])) {
return null;
}
}

View file

@ -34,9 +34,9 @@ class CaschyBridge extends FeedExpander
);
}
protected function parseItem($feedItem)
protected function parseItem($item)
{
$item = parent::parseItem($feedItem);
$item = parent::parseItem($item);
if (strpos($item['uri'], 'https://stadt-bremerhaven.de/') !== 0) {
return $item;

View file

@ -12,9 +12,9 @@ class CommonDreamsBridge extends FeedExpander
$this->collectExpandableDatas('http://www.commondreams.org/rss.xml', 10);
}
protected function parseItem($newsItem)
protected function parseItem($item)
{
$item = parent::parseItem($newsItem);
$item = parent::parseItem($item);
$item['content'] = $this->extractContent($item['uri']);
return $item;
}

View file

@ -13,11 +13,11 @@ class CourrierInternationalBridge extends FeedExpander
$this->collectExpandableDatas(static::URI . 'feed/all/rss.xml', 20);
}
protected function parseItem($feedItem)
protected function parseItem($item)
{
$item = parent::parseItem($feedItem);
$item = parent::parseItem($item);
$articlePage = getSimpleHTMLDOMCached($feedItem->link);
$articlePage = getSimpleHTMLDOMCached($item['uri']);
$content = $articlePage->find('.article-text, depeche-text', 0);
if (!$content) {
return $item;

View file

@ -56,9 +56,10 @@ class DarkReadingBridge extends FeedExpander
$this->collectExpandableDatas($feed_url, $limit);
}
protected function parseItem($newsItem)
protected function parseItem($item)
{
$item = parent::parseItem($newsItem);
$item = parent::parseItem($item);
$article = getSimpleHTMLDOMCached($item['uri']);
$item['content'] = $this->extractArticleContent($article);
$item['enclosures'] = []; //remove author profile picture

View file

@ -43,9 +43,9 @@ class DauphineLibereBridge extends FeedExpander
$this->collectExpandableDatas($url, 10);
}
protected function parseItem($newsItem)
protected function parseItem($item)
{
$item = parent::parseItem($newsItem);
$item = parent::parseItem($item);
$item['content'] = $this->extractContent($item['uri']);
return $item;
}

View file

@ -163,19 +163,6 @@ class DeveloppezDotComBridge extends FeedExpander
]
];
/**
* Return the RSS url for selected domain
*/
private function getRssUrl()
{
$domain = $this->getInput('domain');
if (!empty($domain)) {
return 'https://' . $domain . self::DOMAIN . self::RSS_URL;
}
return self::URI . self::RSS_URL;
}
/**
* Grabs the RSS item from Developpez.com
*/
@ -189,15 +176,14 @@ class DeveloppezDotComBridge extends FeedExpander
* Parse the content of every RSS item and try to get the full article
* pointed to by the item URL instead of the default abstract.
*/
protected function parseItem($newsItem)
protected function parseItem($item)
{
$item = parent::parseItem($item);
if (count($this->items) >= $this->getInput('limit')) {
return null;
}
// This function parse each entry in the RSS with the default parse
$item = parent::parseItem($newsItem);
// There is a bug in the Developpez RSS feed: commas are written as '~?' in the
// title, so they have to be fixed manually
$item['title'] = $this->fixComaInTitle($item['title']);
@ -229,6 +215,19 @@ class DeveloppezDotComBridge extends FeedExpander
return $item;
}
/**
* Return the RSS URL for the selected domain.
*
* When the 'domain' input is non-empty, the URL is built as
* 'https://' . $domain . self::DOMAIN . self::RSS_URL; otherwise the
* default self::URI . self::RSS_URL is returned.
*
* @return string
*/
private function getRssUrl()
{
$domain = $this->getInput('domain');
if (!empty($domain)) {
// A specific sub-domain was requested: prefix it to the shared domain suffix.
return 'https://' . $domain . self::DOMAIN . self::RSS_URL;
}
// No domain selected: fall back to the bridge's base URI.
return self::URI . self::RSS_URL;
}
/**
* Replace '~?' with a proper comma ','
*/
@ -334,6 +333,9 @@ class DeveloppezDotComBridge extends FeedExpander
*/
private function isHtmlTagNotTxt($txt)
{
// Short-circuit on the empty string so str_get_html() is never called with it.
if ($txt === '') {
return false;
}
$html = str_get_html($txt);
// Truthy only when parsing produced a document whose root has at least one child node.
return $html && $html->root && count($html->root->children) > 0;
}

View file

@ -93,21 +93,22 @@ class EconomistBridge extends FeedExpander
$limit = 30;
}
$this->collectExpandableDatas('https://www.economist.com/' . $category . '/rss.xml', $limit);
$url = 'https://www.economist.com/' . $category . '/rss.xml';
$this->collectExpandableDatas($url, $limit);
}
protected function parseItem($feedItem)
protected function parseItem($item)
{
$item = parent::parseItem($feedItem);
$html = getSimpleHTMLDOM($item['uri']);
$item = parent::parseItem($item);
$dom = getSimpleHTMLDOM($item['uri']);
$article = $html->find('#new-article-template', 0);
$article = $dom->find('#new-article-template', 0);
if ($article == null) {
$article = $html->find('main', 0);
$article = $dom->find('main', 0);
}
if ($article) {
$elem = $article->find('div', 0);
list($content, $audio_url) = $this->processContent($html, $elem);
list($content, $audio_url) = $this->processContent($dom, $elem);
$item['content'] = $content;
if ($audio_url != null) {
$item['enclosures'] = [$audio_url];

View file

@ -10,26 +10,28 @@ class EngadgetBridge extends FeedExpander
public function collectData()
{
$url = 'https://www.engadget.com/rss.xml';
$max = 10;
$this->collectExpandableDatas(static::URI . 'rss.xml', $max);
$this->collectExpandableDatas($url, $max);
}
protected function parseItem($newsItem)
protected function parseItem($item)
{
$item = parent::parseItem($newsItem);
$url = (string) $newsItem->link;
if (!$url) {
$item = parent::parseItem($item);
$itemUrl = trim($item['uri']);
if (!$itemUrl) {
return $item;
}
// todo: remove querystring tracking
$articlePage = getSimpleHTMLDOM($url);
$dom = getSimpleHTMLDOM($itemUrl);
// figure contains the main article image
$article = $articlePage->find('figure', 0);
$article = $dom->find('figure', 0);
// .article-text has the actual article
foreach ($articlePage->find('.article-text') as $element) {
foreach ($dom->find('.article-text') as $element) {
$article = $article . $element;
}
$item['content'] = $article;
$item['content'] = $article ?? '';
return $item;
}
}

View file

@ -1,5 +1,8 @@
<?php
/**
* Appears to be protected by cloudflare now
*/
class EsquerdaNetBridge extends FeedExpander
{
const MAINTAINER = 'somini';
@ -23,32 +26,16 @@ class EsquerdaNetBridge extends FeedExpander
]
];
public function getURI()
{
$type = $this->getInput('feed');
return self::URI . '/rss/' . $type;
}
public function getIcon()
{
return 'https://www.esquerda.net/sites/default/files/favicon_0.ico';
}
public function collectData()
{
parent::collectExpandableDatas($this->getURI());
}
protected function parseItem($newsItem)
protected function parseItem($item)
{
# Fix Publish date
$badDate = $newsItem->pubDate;
preg_match('|(?P<day>\d\d)/(?P<month>\d\d)/(?P<year>\d\d\d\d) - (?P<hour>\d\d):(?P<minute>\d\d)|', $badDate, $d);
$newsItem->pubDate = sprintf('%s-%s-%sT%s:%s', $d['year'], $d['month'], $d['day'], $d['hour'], $d['minute']);
$item = parent::parseItem($newsItem);
# Include all the content
$uri = $item['uri'];
$html = getSimpleHTMLDOMCached($uri);
$item = parent::parseItem($item);
$html = getSimpleHTMLDOMCached($item['uri']);
$content = $html->find('div#content div.content', 0);
## Fix author
$authorHTML = $html->find('.field-name-field-op-author a', 0);
@ -72,4 +59,15 @@ class EsquerdaNetBridge extends FeedExpander
$item['content'] = $content;
return $item;
}
public function getURI()
{
$type = $this->getInput('feed');
return self::URI . '/rss/' . $type;
}
public function getIcon()
{
return 'https://www.esquerda.net/sites/default/files/favicon_0.ico';
}
}

View file

@ -43,9 +43,4 @@ class FeedExpanderExampleBridge extends FeedExpander
returnClientError('Unknown version ' . $this->getInput('version') . '!');
}
}
protected function parseItem($newsItem)
{
return (array) $newsItem;
}
}

View file

@ -0,0 +1,23 @@
<?php
declare(strict_types=1);
/**
* Bridge that runs a hard-coded sample feed through FeedExpander.
*
* It does not override parseItem(), so items come from the parent
* implementation unchanged. The metadata constants hold placeholder values.
* NOTE(review): judging by the name, this looks like a manual test fixture
* for the feed parser rather than a user-facing bridge; confirm.
*/
class FeedExpanderTestBridge extends FeedExpander
{
const MAINTAINER = 'No maintainer';
const NAME = 'Unnamed bridge';
const URI = 'https://esdf.com/';
const DESCRIPTION = 'No description provided';
const PARAMETERS = [];
const CACHE_TIMEOUT = 3600;
/**
* Pass the active sample feed URL to collectExpandableDatas().
*
* The commented-out assignments are alternative sample feeds, each
* labelled with its feed format; uncomment one to use it instead.
*/
public function collectData()
{
$url = 'http://static.userland.com/gems/backend/sampleRss.xml'; // rss 0.91
//$url = 'http://feeds.nature.com/nature/rss/current?format=xml'; // rss 1.0
//$url = 'https://dvikan.no/feed.xml'; // rss 2.0
//$url = 'https://nedlasting.geonorge.no/geonorge/Tjenestefeed.xml'; // atom
$this->collectExpandableDatas($url);
}
}

View file

@ -82,9 +82,9 @@ class FilterBridge extends FeedExpander
$this->collectExpandableDatas($this->getURI());
}
protected function parseItem($newItem)
protected function parseItem($item)
{
$item = parent::parseItem($newItem);
$item = parent::parseItem($item);
// Generate title from first 50 characters of content?
if ($this->getInput('title_from_content') && array_key_exists('content', $item)) {

View file

@ -12,12 +12,12 @@ class ForGifsBridge extends FeedExpander
$this->collectExpandableDatas('https://forgifs.com/gallery/srss/7');
}
protected function parseItem($feedItem)
protected function parseItem($item)
{
$item = parent::parseItem($feedItem);
$item = parent::parseItem($item);
$content = str_get_html($item['content']);
$img = $content->find('img', 0);
$dom = str_get_html($item['content']);
$img = $dom->find('img', 0);
$poster = $img->src;
// The actual gif is the same path but its id must be decremented by one.
@ -34,7 +34,7 @@ class ForGifsBridge extends FeedExpander
$img->width = 'auto';
$img->height = 'auto';
$item['content'] = $content;
$item['content'] = (string) $dom;
return $item;
}

View file

@ -14,15 +14,17 @@ class FreeCodeCampBridge extends FeedExpander
$this->collectExpandableDatas('https://www.freecodecamp.org/news/rss/', 15);
}
protected function parseItem($newsItem)
protected function parseItem($item)
{
$item = parent::parseItem($newsItem);
// $articlePage gets the entire page's contents
$articlePage = getSimpleHTMLDOM($newsItem->link);
$item = parent::parseItem($item);
$dom = getSimpleHTMLDOM($item['uri']);
// figure contains the main article image
$article = $articlePage->find('figure', 0);
$article = $dom->find('figure', 0);
// the actual article
foreach ($articlePage->find('.post-full-content') as $element) {
foreach ($dom->find('.post-full-content') as $element) {
$article = $article . $element;
}
$item['content'] = $article;

View file

@ -85,13 +85,14 @@ class FuturaSciencesBridge extends FeedExpander
$this->collectExpandableDatas($url, 10);
}
protected function parseItem($newsItem)
protected function parseItem($item)
{
$item = parent::parseItem($newsItem);
$item = parent::parseItem($item);
$item['uri'] = str_replace('#xtor%3DRSS-8', '', $item['uri']);
$article = getSimpleHTMLDOMCached($item['uri']);
$item['content'] = $this->extractArticleContent($article);
$author = $this->extractAuthor($article);
$dom = getSimpleHTMLDOMCached($item['uri']);
$item['content'] = $this->extractArticleContent($dom);
$author = $this->extractAuthor($dom);
if (!empty($author)) {
$item['author'] = $author;
}

View file

@ -9,15 +9,15 @@ class HardwareInfoBridge extends FeedExpander
public function collectData()
{
$this->collectExpandableDatas('https://nl.hardware.info/updates/all.rss', 20);
$this->collectExpandableDatas('https://nl.hardware.info/updates/all.rss', 10);
}
protected function parseItem($feedItem)
protected function parseItem($item)
{
$item = parent::parseItem($feedItem);
$item = parent::parseItem($item);
//get full article
$articlePage = getSimpleHTMLDOMCached($feedItem->link);
$itemUrl = $item['uri'];
$articlePage = getSimpleHTMLDOMCached($itemUrl);
$article = $articlePage->find('div.article__content', 0);

View file

@ -125,9 +125,10 @@ class HeiseBridge extends FeedExpander
);
}
protected function parseItem($feedItem)
protected function parseItem($item)
{
$item = parent::parseItem($feedItem);
$item = parent::parseItem($item);
$sessioncookie = $this->getInput('sessioncookie');
// strip rss parameter

View file

@ -10,17 +10,16 @@ class IGNBridge extends FeedExpander
public function collectData()
{
$this->collectExpandableDatas('http://feeds.ign.com/ign/all', 15);
$this->collectExpandableDatas('http://feeds.ign.com/ign/all', 2);
}
// IGN's feed is both hidden and incomplete. This bridge tries to fix this.
protected function parseItem($newsItem)
protected function parseItem($item)
{
$item = parent::parseItem($newsItem);
$item = parent::parseItem($item);
// $articlePage gets the entire page's contents
$articlePage = getSimpleHTMLDOM($newsItem->link);
$articlePage = getSimpleHTMLDOM($item['uri']);
// List of BS elements
$uselessElements = [
@ -33,7 +32,7 @@ class IGNBridge extends FeedExpander
'.jsx-4213937408',
'.commerce-container',
'.widget-container',
'.newsletter-signup-button'
'.newsletter-signup-button',
];
// Remove useless elements

View file

@ -12,9 +12,10 @@ class LeMondeInformatiqueBridge extends FeedExpander
$this->collectExpandableDatas(self::URI . 'rss/rss.xml', 10);
}
protected function parseItem($newsItem)
protected function parseItem($item)
{
$item = parent::parseItem($newsItem);
$item = parent::parseItem($item);
$article_html = getSimpleHTMLDOMCached($item['uri']);
//Deduce thumbnail URL from article image URL

View file

@ -13,12 +13,11 @@ class ListverseBridge extends FeedExpander
$this->collectExpandableDatas('https://listverse.com/feed/', 15);
}
protected function parseItem($newsItem)
protected function parseItem($item)
{
$item = parent::parseItem($newsItem);
// $articlePage gets the entire page's contents
$articlePage = getSimpleHTMLDOM($newsItem->link);
$article = $articlePage->find('#articlecontentonly', 0);
$item = parent::parseItem($item);
$dom = getSimpleHTMLDOM($item['uri']);
$article = $dom->find('#articlecontentonly', 0);
$item['content'] = $article;
return $item;
}

View file

@ -29,9 +29,11 @@ class MediapartBridge extends FeedExpander
$this->collectExpandableDatas($url);
}
protected function parseItem($newsItem)
protected function parseItem($item)
{
$item = parent::parseItem($newsItem);
$item = parent::parseItem($item);
$itemUrl = $item['uri'];
// Mediapart provides multiple types of content.
// We only process items relative to the newspaper
@ -49,12 +51,8 @@ class MediapartBridge extends FeedExpander
$opt = [];
$opt[CURLOPT_COOKIE] = 'MPSESSID=' . $mpsessid;
// Get the page
$articlePage = getSimpleHTMLDOM(
$newsItem->link . '?onglet=full',
[],
$opt
);
$pageUrl = $itemUrl . '?onglet=full';
$articlePage = getSimpleHTMLDOM($pageUrl, [], $opt);
// Extract the article content
$content = $articlePage->find('div.content-article', 0)->innertext;

View file

@ -22,17 +22,19 @@ class MsnMondeBridge extends FeedExpander
public function collectData()
{
$this->collectExpandableDatas(self::FEED_URL, self::LIMIT);
$this->collectExpandableDatas(self::FEED_URL, 10);
}
protected function parseItem($newsItem)
protected function parseItem($item)
{
$item = parent::parseItem($newsItem);
$item = parent::parseItem($item);
if (!preg_match('#fr-fr/actualite.*/ar-(?<id>[\w]*)\?#', $item['uri'], $matches)) {
return;
return null;
}
$json = json_decode(getContents(self::JSON_URL . $matches['id']), true);
$jsonString = getContents(self::JSON_URL . $matches['id']);
$json = json_decode($jsonString, true);
$item['content'] = $json['body'];
if (!empty($json['authors'])) {
$item['author'] = reset($json['authors'])['name'];

View file

@ -10,17 +10,18 @@ class NYTBridge extends FeedExpander
public function collectData()
{
$this->collectExpandableDatas('https://rss.nytimes.com/services/xml/rss/nyt/HomePage.xml', 40);
$url = 'https://rss.nytimes.com/services/xml/rss/nyt/HomePage.xml';
$this->collectExpandableDatas($url, 40);
}
protected function parseItem($newsItem)
protected function parseItem($item)
{
$item = parent::parseItem($newsItem);
$item = parent::parseItem($item);
$article = '';
// $articlePage gets the entire page's contents
try {
$articlePage = getSimpleHTMLDOM($newsItem->link);
$articlePage = getSimpleHTMLDOM($item['uri']);
} catch (HttpException $e) {
// 403 Forbidden, This means we got anti-bot response
if ($e->getCode() === 403) {

View file

@ -88,9 +88,10 @@ class NextInpactBridge extends FeedExpander
$this->collectExpandableDatas($url, $limit);
}
protected function parseItem($newsItem)
protected function parseItem($item)
{
$item = parent::parseItem($newsItem);
$item = parent::parseItem($item);
$item['content'] = $this->extractContent($item, $item['uri']);
if (is_null($item['content'])) {
return null; //Filtered article

View file

@ -26,7 +26,8 @@ class NextgovBridge extends FeedExpander
public function collectData()
{
$this->collectExpandableDatas(self::URI . 'rss/' . $this->getInput('category') . '/', 10);
$url = self::URI . 'rss/' . $this->getInput('category') . '/';
$this->collectExpandableDatas($url, 10);
}
protected function parseItem($newsItem)

View file

@ -12,9 +12,10 @@ class NiceMatinBridge extends FeedExpander
$this->collectExpandableDatas(self::URI . 'derniere-minute/rss', 10);
}
protected function parseItem($newsItem)
protected function parseItem($item)
{
$item = parent::parseItem($newsItem);
$item = parent::parseItem($item);
$item['content'] = $this->extractContent($item['uri']);
return $item;
}

View file

@ -65,12 +65,14 @@ class NyaaTorrentsBridge extends FeedExpander
$this->collectExpandableDatas($this->getURI(), 20);
}
protected function parseItem($newItem)
protected function parseItem($newsItem)
{
$item = parent::parseItem($newItem);
$item = parent::parseItem($newsItem);
$nyaaFields = (array)($newsItem->children('nyaa', true));
$item['id'] = str_replace(['https://nyaa.si/download/', '.torrent'], '', $item['uri']);
$nyaaFields = (array)($newItem->children('nyaa', true));
$item = array_merge($item, $nyaaFields);
// Convert URI from torrent file to web page

View file

@ -117,18 +117,18 @@ class OnVaSortirBridge extends FeedExpander
]
];
public function collectData()
{
$url = 'https://' . $this->getInput('city') . '.onvasortir.com/rss.php';
$this->collectExpandableDatas($url);
}
protected function parseItem($item)
{
$item = parent::parseItem($item);
$html = getSimpleHTMLDOMCached($item['uri']);
$text = $html->find('div.corpsMax', 0)->innertext;
$dom = getSimpleHTMLDOMCached($item['uri']);
$text = $dom->find('div.corpsMax', 0)->innertext;
$item['content'] = utf8_encode($text);
return $item;
}
public function collectData()
{
$this->collectExpandableDatas('https://' .
$this->getInput('city') . '.onvasortir.com/rss.php');
}
}

View file

@ -29,22 +29,25 @@ but some RSS readers don\'t support this. "img" tag are supported by most browse
$this->collectExpandableDatas('https://www.phoronix.com/rss.php', $this->getInput('n'));
}
protected function parseItem($newsItem)
protected function parseItem($item)
{
$item = parent::parseItem($newsItem);
// $articlePage gets the entire page's contents
$articlePage = getSimpleHTMLDOM($newsItem->link);
$item = parent::parseItem($item);
$itemUrl = $item['uri'];
$articlePage = getSimpleHTMLDOM($itemUrl);
$articlePage = defaultLinkTo($articlePage, $this->getURI());
// Extract final link. From Facebook's like plugin.
parse_str(parse_url($articlePage->find('iframe[src^=//www.facebook.com/plugins]', 0), PHP_URL_QUERY), $facebookQuery);
$parsedUrlQuery = parse_url($articlePage->find('iframe[src^=//www.facebook.com/plugins]', 0), PHP_URL_QUERY);
parse_str($parsedUrlQuery, $facebookQuery);
if (array_key_exists('href', $facebookQuery)) {
$newsItem->link = $facebookQuery['href'];
$itemUrl = $facebookQuery['href'];
}
$item['content'] = $this->extractContent($articlePage);
$pages = $articlePage->find('.pagination a[!title]');
foreach ($pages as $page) {
$pageURI = urljoin($newsItem->link, html_entity_decode($page->href));
$pageURI = urljoin($itemUrl, html_entity_decode($page->href));
$page = getSimpleHTMLDOM($pageURI);
$item['content'] .= $this->extractContent($page);
}

View file

@ -6,9 +6,15 @@ class QwantzBridge extends FeedExpander
const URI = 'https://qwantz.com/';
const DESCRIPTION = 'Latest comic.';
protected function parseItem($feedItem)
public function collectData()
{
$item = parent::parseItem($feedItem);
$this->collectExpandableDatas(self::URI . 'rssfeed.php');
}
protected function parseItem($item)
{
$item = parent::parseItem($item);
$item['author'] = 'Ryan North';
preg_match('/title="(.*?)"/', $item['content'], $matches);
@ -25,11 +31,6 @@ class QwantzBridge extends FeedExpander
return $item;
}
public function collectData()
{
$this->collectExpandableDatas(self::URI . 'rssfeed.php');
}
public function getIcon()
{
return self::URI . 'favicon.ico';

View file

@ -12,12 +12,11 @@ class RaceDepartmentBridge extends FeedExpander
$this->collectExpandableDatas('https://www.racedepartment.com/ams/index.rss', 10);
}
protected function parseItem($feedItem)
protected function parseItem($item)
{
$item = parent::parseItem($feedItem);
$item = parent::parseItem($item);
//fetch page
$articlePage = getSimpleHTMLDOMCached($feedItem->link);
$articlePage = getSimpleHTMLDOMCached($item['uri']);
$coverImage = $articlePage->find('img.js-articleCoverImage', 0);
#relative url -> absolute url

View file

@ -42,9 +42,9 @@ class ScribbleHubBridge extends FeedExpander
$this->collectExpandableDatas($url);
}
protected function parseItem($newItem)
protected function parseItem($item)
{
$item = parent::parseItem($newItem);
$item = parent::parseItem($item);
//For series, filter out other series from 'All' feed
if (
@ -57,7 +57,7 @@ class ScribbleHubBridge extends FeedExpander
$item['comments'] = $item['uri'] . '#comments';
try {
$item_html = getSimpleHTMLDOMCached($item['uri']);
$dom = getSimpleHTMLDOMCached($item['uri']);
} catch (HttpException $e) {
// 403 Forbidden, This means we got anti-bot response
if ($e->getCode() === 403) {
@ -66,22 +66,22 @@ class ScribbleHubBridge extends FeedExpander
throw $e;
}
$item_html = defaultLinkTo($item_html, self::URI);
$dom = defaultLinkTo($dom, self::URI);
//Retrieve full description from page contents
$item['content'] = $item_html->find('#chp_raw', 0);
$item['content'] = $dom->find('#chp_raw', 0);
//Retrieve image for thumbnail
$item_image = $item_html->find('.s_novel_img > img', 0)->src;
$item_image = $dom->find('.s_novel_img > img', 0)->src;
$item['enclosures'] = [$item_image];
//Restore lost categories
$item_story = html_entity_decode($item_html->find('.chp_byauthor > a', 0)->innertext);
$item_sid = $item_html->find('#mysid', 0)->value;
$item_story = html_entity_decode($dom->find('.chp_byauthor > a', 0)->innertext);
$item_sid = $dom->find('#mysid', 0)->value;
$item['categories'] = [$item_story, $item_sid];
//Generate UID
$item_pid = $item_html->find('#mypostid', 0)->value;
$item_pid = $dom->find('#mypostid', 0)->value;
$item['uid'] = $item_sid . "/$item_pid";
return $item;

View file

@ -21,6 +21,12 @@ class SplCenterBridge extends FeedExpander
const CACHE_TIMEOUT = 3600; // 1 hour
public function collectData()
{
$url = $this->getURI() . '/rss.xml';
$this->collectExpandableDatas($url);
}
protected function parseItem($item)
{
$item = parent::parseItem($item);
@ -37,11 +43,6 @@ class SplCenterBridge extends FeedExpander
return $item;
}
public function collectData()
{
$this->collectExpandableDatas($this->getURI() . '/rss.xml');
}
public function getURI()
{
if (!is_null($this->getInput('content'))) {

View file

@ -30,14 +30,17 @@ class TapasBridge extends FeedExpander
protected $id;
public function getURI()
public function collectData()
{
if ($this->id) {
return self::URI . 'rss/series/' . $this->id;
} else {
return self::URI . 'series/' . $this->getInput('title') . '/info/';
if (preg_match('/^[\d]+$/', $this->getInput('title'))) {
$this->id = $this->getInput('title');
}
return self::URI;
if ($this->getInput('force_title') or !$this->id) {
$html = getSimpleHTMLDOM($this->getURI()) or returnServerError('Could not request ' . $this->getURI());
$this->id = $html->find('meta[property$=":url"]', 0)->content;
$this->id = str_ireplace(['tapastic://series/', '/info'], '', $this->id);
}
$this->collectExpandableDatas($this->getURI());
}
protected function parseItem($feedItem)
@ -72,16 +75,13 @@ class TapasBridge extends FeedExpander
return $item;
}
public function collectData()
public function getURI()
{
if (preg_match('/^[\d]+$/', $this->getInput('title'))) {
$this->id = $this->getInput('title');
if ($this->id) {
return self::URI . 'rss/series/' . $this->id;
} else {
return self::URI . 'series/' . $this->getInput('title') . '/info/';
}
if ($this->getInput('force_title') or !$this->id) {
$html = getSimpleHTMLDOM($this->getURI()) or returnServerError('Could not request ' . $this->getURI());
$this->id = $html->find('meta[property$=":url"]', 0)->content;
$this->id = str_ireplace(['tapastic://series/', '/info'], '', $this->id);
}
$this->collectExpandableDatas($this->getURI());
return self::URI;
}
}

View file

@ -52,18 +52,15 @@ class TheGuardianBridge extends FeedExpander
public function collectData()
{
$feed = $this->getInput('feed');
$feedURL = 'https://feeds.theguardian.com/theguardian/' . $feed;
$this->collectExpandableDatas($feedURL, 10);
$url = 'https://feeds.theguardian.com/theguardian/' . $feed;
$this->collectExpandableDatas($url, 10);
}
protected function parseItem($newsItem)
protected function parseItem($item)
{
$item = parent::parseItem($newsItem);
$item = parent::parseItem($item);
// --- Recovering the article ---
// $articlePage gets the entire page's contents
$articlePage = getSimpleHTMLDOM($newsItem->link);
$articlePage = getSimpleHTMLDOM($item['uri']);
// figure contains the main article image
$article = $articlePage->find('figure', 0);
// content__article-body has the actual article

View file

@ -8,18 +8,24 @@ class TwitterEngineeringBridge extends FeedExpander
const DESCRIPTION = 'Returns the newest articles.';
const CACHE_TIMEOUT = 21600; // 6h
public function collectData()
{
$url = 'https://blog.twitter.com/engineering/en_us/blog.rss';
$this->collectExpandableDatas($url);
}
protected function parseItem($item)
{
$item = parent::parseItem($item);
$article_html = getSimpleHTMLDOMCached($item['uri']);
if (!$article_html) {
$dom = getSimpleHTMLDOMCached($item['uri']);
if (!$dom) {
$item['content'] .= '<p><em>Could not request ' . $this->getName() . ': ' . $item['uri'] . '</em></p>';
return $item;
}
$article_html = defaultLinkTo($article_html, $this->getURI());
$dom = defaultLinkTo($dom, $this->getURI());
$article_body = $article_html->find('div.column.column-6', 0);
$article_body = $dom->find('div.column.column-6', 0);
// Remove elements that are not part of article content
$unwanted_selector = 'div.bl02-blog-post-text-masthead, div.tweet-error-text, div.bl13-tweet-template';
@ -33,8 +39,8 @@ class TwitterEngineeringBridge extends FeedExpander
}
$item['content'] = $article_body;
$item['timestamp'] = strtotime($article_html->find('span.b02-blog-post-no-masthead__date', 0)->innertext);
$item['categories'] = self::getCategoriesFromTags($article_html);
$item['timestamp'] = strtotime($dom->find('span.b02-blog-post-no-masthead__date', 0)->innertext);
$item['categories'] = self::getCategoriesFromTags($dom);
return $item;
}
@ -53,12 +59,6 @@ class TwitterEngineeringBridge extends FeedExpander
return $categories;
}
public function collectData()
{
$feed = static::URI . 'en_us/blog.rss';
$this->collectExpandableDatas($feed);
}
public function getName()
{
// Else the original feed returns "English (US)" as the title

View file

@ -13,11 +13,11 @@ class VarietyBridge extends FeedExpander
$this->collectExpandableDatas('https://feeds.feedburner.com/variety/headlines', 15);
}
protected function parseItem($newsItem)
protected function parseItem($item)
{
$item = parent::parseItem($newsItem);
$item = parent::parseItem($item);
// $articlePage gets the entire page's contents
$articlePage = getSimpleHTMLDOM($newsItem->link);
$articlePage = getSimpleHTMLDOM($item['uri']);
// Remove Script tags
foreach ($articlePage->find('script') as $script_tag) {

View file

@ -32,14 +32,14 @@ class ViceBridge extends FeedExpander
$this->collectExpandableDatas($feedURL, 10);
}
protected function parseItem($newsItem)
protected function parseItem($item)
{
$item = parent::parseItem($newsItem);
// $articlePage gets the entire page's contents
$articlePage = getSimpleHTMLDOM($newsItem->link);
$item = parent::parseItem($item);
$articlePage = getSimpleHTMLDOM($item['uri']);
// text and embedded content
$article = $articlePage->find('.article__body', 0);
$item['content'] = $article;
$item['content'] = $article ?? '';
return $item;
}

View file

@ -50,13 +50,16 @@ class WiredBridge extends FeedExpander
$this->collectExpandableDatas($feed_url, $limit);
}
protected function parseItem($newsItem)
protected function parseItem($item)
{
$item = parent::parseItem($newsItem);
$item = parent::parseItem($item);
$originalContent = $item['content'];
$article = getSimpleHTMLDOMCached($item['uri']);
$item['content'] = $this->extractArticleContent($article);
$headline = strval($newsItem->description);
$headline = $originalContent;
if (!empty($headline)) {
$item['content'] = '<p><b>' . $headline . '</b></p>' . $item['content'];
}

View file

@ -20,50 +20,56 @@ class WordPressBridge extends FeedExpander
],
]];
private function cleanContent($content)
public function collectData()
{
$content = stripWithDelimiters($content, '<script', '</script>');
$content = preg_replace('/<div class="wpa".*/', '', $content);
$content = preg_replace('/<form.*\/form>/', '', $content);
return $content;
$limit = $this->getInput('limit') ?? 10;
if ($this->getInput('url') && substr($this->getInput('url'), 0, strlen('http')) !== 'http') {
// just in case someone finds a way to access local files by playing with the url
returnClientError('The url parameter must either refer to http or https protocol.');
}
try {
$this->collectExpandableDatas($this->getURI() . '/feed/atom/', $limit);
} catch (Exception $e) {
$this->collectExpandableDatas($this->getURI() . '/?feed=atom', $limit);
}
}
protected function parseItem($newItem)
protected function parseItem($item)
{
$item = parent::parseItem($newItem);
$item = parent::parseItem($item);
$article_html = getSimpleHTMLDOMCached($item['uri']);
$dom = getSimpleHTMLDOMCached($item['uri']);
// Find article body
$article = null;
switch (true) {
case !empty($this->getInput('content-selector')):
// custom content selector (manually specified by user)
$article = $article_html->find($this->getInput('content-selector'), 0);
$article = $dom->find($this->getInput('content-selector'), 0);
break;
case !is_null($article_html->find('[itemprop=articleBody]', 0)):
case !is_null($dom->find('[itemprop=articleBody]', 0)):
// highest priority content div (used for SEO)
$article = $article_html->find('[itemprop=articleBody]', 0);
$article = $dom->find('[itemprop=articleBody]', 0);
break;
case !is_null($article_html->find('.article-content', 0)):
case !is_null($dom->find('.article-content', 0)):
// more precise than article when present
$article = $article_html->find('.article-content', 0);
$article = $dom->find('.article-content', 0);
break;
case !is_null($article_html->find('article', 0)):
case !is_null($dom->find('article', 0)):
// most common content div
$article = $article_html->find('article', 0);
$article = $dom->find('article', 0);
break;
case !is_null($article_html->find('.single-content', 0)):
case !is_null($dom->find('.single-content', 0)):
// another common content div
$article = $article_html->find('.single-content', 0);
$article = $dom->find('.single-content', 0);
break;
case !is_null($article_html->find('.post-content', 0)):
case !is_null($dom->find('.post-content', 0)):
// another common content div
$article = $article_html->find('.post-content', 0);
$article = $dom->find('.post-content', 0);
break;
case !is_null($article_html->find('.post', 0)):
case !is_null($dom->find('.post', 0)):
// for old WordPress themes without HTML5
$article = $article_html->find('.post', 0);
$article = $dom->find('.post', 0);
break;
}
@ -76,7 +82,7 @@ class WordPressBridge extends FeedExpander
// Find article main image
$article = convertLazyLoading($article);
$article_image = $article_html->find('img.wp-post-image', 0);
$article_image = $dom->find('img.wp-post-image', 0);
if (!empty($item['content']) && (!is_object($article_image) || empty($article_image->src))) {
$article_image = str_get_html($item['content'])->find('img.wp-post-image', 0);
}
@ -106,6 +112,14 @@ class WordPressBridge extends FeedExpander
return $item;
}
private function cleanContent($content)
{
$content = stripWithDelimiters($content, '<script', '</script>');
$content = preg_replace('/<div class="wpa".*/', '', $content);
$content = preg_replace('/<form.*\/form>/', '', $content);
return $content;
}
public function getURI()
{
$url = $this->getInput('url');
@ -114,18 +128,4 @@ class WordPressBridge extends FeedExpander
}
return $url;
}
public function collectData()
{
$limit = $this->getInput('limit') ?? 10;
if ($this->getInput('url') && substr($this->getInput('url'), 0, strlen('http')) !== 'http') {
// just in case someone finds a way to access local files by playing with the url
returnClientError('The url parameter must either refer to http or https protocol.');
}
try {
$this->collectExpandableDatas($this->getURI() . '/feed/atom/', $limit);
} catch (Exception $e) {
$this->collectExpandableDatas($this->getURI() . '/?feed=atom', $limit);
}
}
}

View file

@ -30,9 +30,9 @@ class WorldOfTanksBridge extends FeedExpander
$this->collectExpandableDatas(sprintf('https://worldoftanks.eu/%s/rss/news/', $this->getInput('lang')));
}
protected function parseItem($newsItem)
protected function parseItem($item)
{
$item = parent::parseItem($newsItem);
$item = parent::parseItem($item);
$item['content'] = $this->loadFullArticle($item['uri']);
return $item;
}

View file

@ -50,19 +50,19 @@ class ZeitBridge extends FeedExpander
'defaultValue' => 5
]
]];
const LIMIT = 5;
public function collectData()
{
$this->collectExpandableDatas(
$this->getInput('category'),
$this->getInput('limit') ?: static::LIMIT
);
$url = $this->getInput('category');
$limit = $this->getInput('limit') ?: 5;
$this->collectExpandableDatas($url, $limit);
}
protected function parseItem($item)
{
$item = parent::parseItem($item);
$item['enclosures'] = [];
$headers = [