rss-bridge/bridges/GolemBridge.php
Mynacol 8865521b3b
[GolemBridge] Remove image galleries (#2761)
Do not add all images of the image gallery, but only the preselected one.

Often, the same gallery is used multiple times with different preselected
images. The previous implementation always added all images of the
gallery, cluttering the article. This patch only adds the preselected one.

The no-js link wrapped around the gallery leads to a 403 Forbidden
page, so linking to it is not a workable way to support galleries.
2022-06-04 22:27:24 +02:00

125 lines
3.6 KiB
PHP

<?php
/**
 * Bridge for golem.de that replaces the teaser text of each feed entry with
 * the content scraped from the linked article page(s).
 *
 * The feed itself is parsed by the parent FeedExpander; this class only
 * post-processes each item by downloading the article HTML, following
 * `rel="next"` links for multi-page articles and concatenating the extracted
 * markup into the item's content.
 */
class GolemBridge extends FeedExpander {
    const MAINTAINER = 'Mynacol';
    const NAME = 'Golem Bridge';
    const URI = 'https://www.golem.de/';
    const CACHE_TIMEOUT = 1800; // 30min
    const DESCRIPTION = 'Returns the full articles instead of only the intro';
    const PARAMETERS = array(array(
        'category' => array(
            'name' => 'Category',
            'type' => 'list',
            'values' => array(
                'Alle News'
                    => 'https://rss.golem.de/rss.php?feed=ATOM1.0',
                'Audio/Video'
                    => 'https://rss.golem.de/rss.php?ms=audio-video&feed=ATOM1.0',
                'Auto'
                    => 'https://rss.golem.de/rss.php?ms=auto&feed=ATOM1.0',
                'Foto'
                    => 'https://rss.golem.de/rss.php?ms=foto&feed=ATOM1.0',
                'Games'
                    => 'https://rss.golem.de/rss.php?ms=games&feed=ATOM1.0',
                'Handy'
                    => 'https://rss.golem.de/rss.php?ms=handy&feed=ATOM1.0',
                'Internet'
                    => 'https://rss.golem.de/rss.php?ms=internet&feed=ATOM1.0',
                'Mobil'
                    => 'https://rss.golem.de/rss.php?ms=mobil&feed=ATOM1.0',
                'Open Source'
                    => 'https://rss.golem.de/rss.php?ms=open-source&feed=ATOM1.0',
                'Politik/Recht'
                    => 'https://rss.golem.de/rss.php?ms=politik-recht&feed=ATOM1.0',
                'Security'
                    => 'https://rss.golem.de/rss.php?ms=security&feed=ATOM1.0',
                'Desktop-Applikationen'
                    => 'https://rss.golem.de/rss.php?ms=desktop-applikationen&feed=ATOM1.0',
                'Software-Entwicklung'
                    => 'https://rss.golem.de/rss.php?ms=softwareentwicklung&feed=ATOM1.0',
                'Wirtschaft'
                    => 'https://rss.golem.de/rss.php?ms=wirtschaft&feed=ATOM1.0',
                'Wissenschaft'
                    => 'https://rss.golem.de/rss.php?ms=wissenschaft&feed=ATOM1.0'
            )
        ),
        'limit' => array(
            'name' => 'Limit',
            'type' => 'number',
            'required' => false,
            'title' => 'Specify number of full articles to return',
            'defaultValue' => 5
        )
    ));

    // Fallback number of articles when the 'limit' input is empty or 0.
    const LIMIT = 5;

    // Extra request header sent with every article download.
    // NOTE(review): presumably pre-answers the site's consent dialog - confirm.
    const HEADERS = array('Cookie: golem_consent20=simple|220101;');

    /**
     * Expands the selected category feed, limited to the requested number of
     * items (or LIMIT when no usable limit was supplied).
     */
    public function collectData() {
        $this->collectExpandableDatas(
            $this->getInput('category'),
            $this->getInput('limit') ?: static::LIMIT
        );
    }

    /**
     * Turns one feed entry into a full-text item.
     *
     * Starting at the entry's URI, every page of the article is downloaded
     * and its extracted markup is appended to the item's content. The item
     * URI and author are updated from each page when the page provides them.
     *
     * @param mixed $item Feed entry as handed over by the parent class.
     * @return array Item array as produced by the parent, with 'content',
     *               'uri' and possibly 'author' filled from the article.
     */
    protected function parseItem($item) {
        $item = parent::parseItem($item);
        $item['content'] = $item['content'] ?? '';

        $uri = $item['uri'] ?? null;
        // Pages already fetched; protects against a rel="next" cycle.
        $visited = array();

        while ($uri && !isset($visited[$uri])) {
            $visited[$uri] = true;

            $articlePage = getSimpleHTMLDOMCached($uri, static::CACHE_TIMEOUT, static::HEADERS);
            if (!$articlePage) {
                // Nothing parseable came back; keep what was collected so far.
                break;
            }

            // URI without RSS feed reference; keep the previous URI when the
            // page does not carry the meta tag.
            $canonical = $articlePage->find('head meta[name="twitter:url"]', 0);
            if ($canonical && $canonical->content) {
                $item['uri'] = $canonical->content;
            }

            $author = $articlePage->find('article header .authors .authors__name', 0);
            if ($author) {
                $item['author'] = $author->innertext;
            }

            $item['content'] .= $this->extractContent($articlePage);

            // next page
            $nextLink = $articlePage->find('link[rel="next"]', 0);
            $uri = ($nextLink && $nextLink->href)
                ? $this->resolveUri($nextLink->href)
                : null;
        }

        return $item;
    }

    /**
     * Resolves a link target found on an article page against the site root.
     *
     * Absolute and protocol-relative targets are kept; everything else is
     * appended to URI with exactly one slash in between.
     *
     * @param string $href Raw href attribute value.
     * @return string Absolute URL.
     */
    private function resolveUri($href) {
        if (preg_match('#^https?://#i', $href)) {
            return $href;
        }

        if (strncmp($href, '//', 2) === 0) {
            return 'https:' . $href;
        }

        return rtrim(static::URI, '/') . '/' . ltrim($href, '/');
    }

    /**
     * Extracts the readable part of one article page as an HTML string.
     *
     * Known unwanted elements are removed first, then the page heading of
     * paged articles, the header's paragraphs/figures and the body's
     * paragraphs, headings and images are concatenated in that order.
     * Missing page parts are skipped instead of aborting the whole feed.
     *
     * @param object $page Parsed article page (simple_html_dom document).
     * @return string Concatenated HTML; empty when no article was found.
     */
    private function extractContent($page) {
        $article = $page->find('article', 0);
        if (!$article) {
            return '';
        }

        // delete known bad elements
        $badSelector = implode(', ', array(
            'div[id*="adtile"]',
            '#job-market',
            '#seminars',
            'div.gbox_affiliate',
            'div.toc',
            '.embedcontent'
        ));
        foreach ($article->find($badSelector) as $bad) {
            $bad->remove();
        }

        // reload html, as remove() is buggy
        $article = str_get_html($article->outertext);
        if (!$article) {
            return '';
        }

        $item = '';

        $pageHeader = $article->find('header.paged-cluster-header h1', 0);
        if ($pageHeader) {
            $item .= $pageHeader;
        }

        $header = $article->find('header', 0);
        if ($header) {
            foreach ($header->find('p, figure') as $element) {
                $item .= $element;
            }
        }

        $content = $article->find('div.formatted', 0);
        if (!$content) {
            return $item;
        }

        // full image quality
        foreach ($content->find('img[data-src-full][src*="."]') as $img) {
            $img->src = $img->getAttribute('data-src-full');
        }

        foreach ($content->find('p, h1, h3, img[src*="."]') as $element) {
            $item .= $element;
        }

        return $item;
    }
}