rss-bridge/bridges/GolemBridge.php
Mynacol c5f586497f [GolemBridge] Remove multi-page page headers
On multi-page articles like [1], all the pages after the first one have
a page header that we add in the article content. When we tack the
pages together again, we don't need those extra page headers.

[1] https://www.golem.de/news/science-fiction-die-zehn-besten-filme-aus-den-spannenden-70ern-2312-179557.html
2023-12-16 11:21:19 +01:00

138 lines
4.6 KiB
PHP

<?php
/**
 * Bridge for golem.de that expands each feed entry with the article body
 * scraped from the article page(s), following "next page" links so that
 * multi-page articles end up in a single item.
 */
class GolemBridge extends FeedExpander
{
    const MAINTAINER = 'Mynacol';
    const NAME = 'Golem Bridge';
    const URI = 'https://www.golem.de/';
    const CACHE_TIMEOUT = 1800; // 30min
    const DESCRIPTION = 'Returns the full articles instead of only the intro';
    const PARAMETERS = [[
        'category' => [
            'name' => 'Category',
            'type' => 'list',
            'values' => [
                'Alle News'
                    => 'https://rss.golem.de/rss.php?feed=ATOM1.0',
                'Audio/Video'
                    => 'https://rss.golem.de/rss.php?ms=audio-video&feed=ATOM1.0',
                'Auto'
                    => 'https://rss.golem.de/rss.php?ms=auto&feed=ATOM1.0',
                'Foto'
                    => 'https://rss.golem.de/rss.php?ms=foto&feed=ATOM1.0',
                'Games'
                    => 'https://rss.golem.de/rss.php?ms=games&feed=ATOM1.0',
                'Handy'
                    => 'https://rss.golem.de/rss.php?ms=handy&feed=ATOM1.0',
                'Internet'
                    => 'https://rss.golem.de/rss.php?ms=internet&feed=ATOM1.0',
                'Mobil'
                    => 'https://rss.golem.de/rss.php?ms=mobil&feed=ATOM1.0',
                'Open Source'
                    => 'https://rss.golem.de/rss.php?ms=open-source&feed=ATOM1.0',
                'Politik/Recht'
                    => 'https://rss.golem.de/rss.php?ms=politik-recht&feed=ATOM1.0',
                'Security'
                    => 'https://rss.golem.de/rss.php?ms=security&feed=ATOM1.0',
                'Desktop-Applikationen'
                    => 'https://rss.golem.de/rss.php?ms=desktop-applikationen&feed=ATOM1.0',
                'Software-Entwicklung'
                    => 'https://rss.golem.de/rss.php?ms=softwareentwicklung&feed=ATOM1.0',
                'Wirtschaft'
                    => 'https://rss.golem.de/rss.php?ms=wirtschaft&feed=ATOM1.0',
                'Wissenschaft'
                    => 'https://rss.golem.de/rss.php?ms=wissenschaft&feed=ATOM1.0'
            ]
        ],
        'limit' => [
            'name' => 'Limit',
            'type' => 'number',
            'required' => false,
            'title' => 'Specify number of full articles to return',
            'defaultValue' => 5
        ]
    ]];

    // Fallback item count used when the 'limit' input is empty or 0.
    const LIMIT = 5;

    // Extra HTTP headers sent with every article page request.
    // NOTE(review): the cookie presumably pre-answers the site's consent dialog - confirm.
    const HEADERS = ['Cookie: golem_consent20=simple|220101;'];

    /**
     * Expands the feed selected by the 'category' input, capped at the
     * 'limit' input (or LIMIT when that input is falsy).
     */
    public function collectData()
    {
        $this->collectExpandableDatas(
            $this->getInput('category'),
            $this->getInput('limit') ?: static::LIMIT
        );
    }

    /**
     * Enriches one feed item with the scraped article content.
     *
     * Starting at the item's 'uri', every page reachable through
     * <link rel="next"> is fetched and its extracted content is appended to
     * the item's 'content'. The item's 'uri' is replaced by the page's
     * twitter:url meta value when present, and 'categories' is filled from
     * the tag list found on the pages.
     *
     * @param array $item Feed item; 'uri' is read, 'content' is appended to.
     * @return array The enriched item.
     */
    protected function parseItem(array $item)
    {
        $item['content'] ??= '';
        $uri = $item['uri'] ?? null;

        $visited = [];
        $categories = [];

        while ($uri) {
            if (isset($visited[$uri])) {
                // Already fetched this page: stop to prevent an endless loop.
                break;
            }
            $visited[$uri] = true;

            $articlePage = getSimpleHTMLDOMCached($uri, static::CACHE_TIMEOUT, static::HEADERS);
            if (!$articlePage) {
                break;
            }

            // URI without RSS feed reference; keep the current URI if the meta tag is missing.
            $cleanUri = $articlePage->find('head meta[name="twitter:url"]', 0);
            if ($cleanUri && $cleanUri->content) {
                $item['uri'] = $cleanUri->content;
            }

            foreach ($articlePage->find('ul.tags__list li') as $tag) {
                $categories[] = trim(html_entity_decode($tag->plaintext));
            }

            $item['content'] .= $this->extractContent($articlePage);

            // next page
            $next = $articlePage->find('link[rel="next"]', 0);
            $uri = ($next && $next->href) ? $this->resolveUri($next->href) : null;
        }

        if ($categories) {
            // array_unique() keeps the original keys; re-index to get a plain list.
            $item['categories'] = array_values(array_unique($categories));
        }

        return $item;
    }

    /**
     * Turns a link target into an absolute URL below static::URI.
     *
     * Absolute http(s) URLs are returned unchanged, protocol-relative ones get
     * an https scheme, everything else is joined to URI with exactly one slash.
     *
     * @param string $href Value of a link's href attribute.
     * @return string Absolute URL.
     */
    private function resolveUri($href)
    {
        $href = trim($href);

        if (preg_match('#^https?://#i', $href)) {
            return $href;
        }
        if (strpos($href, '//') === 0) {
            return 'https:' . $href;
        }

        return rtrim(static::URI, '/') . '/' . ltrim($href, '/');
    }

    /**
     * Builds the HTML for one article page.
     *
     * Known unwanted elements are stripped from the <article>, then the
     * header's paragraphs/figures and the body's paragraphs, headings and
     * images are concatenated in document order. Missing parts are skipped
     * instead of raising an error.
     *
     * @param object $page Parsed page as returned by getSimpleHTMLDOMCached().
     * @return string Concatenated HTML, or '' when the page has no <article>.
     */
    private function extractContent($page)
    {
        $article = $page->find('article', 0);
        if (!$article) {
            return '';
        }

        // delete known bad elements
        $badSelector = implode(', ', [
            'div[id*="adtile"]',
            '#job-market',
            '#seminars',
            'iframe',
            'div.gbox_affiliate',
            'div.toc',
            '.embedcontent',
            'script',
        ]);
        foreach ($article->find($badSelector) as $bad) {
            $bad->remove();
        }

        // reload html, as remove() is buggy
        $article = str_get_html($article->outertext);
        if (!$article) {
            return '';
        }

        $html = '';

        $header = $article->find('header', 0);
        if ($header) {
            foreach ($header->find('p, figure') as $element) {
                $html .= $element;
            }
        }

        $content = $article->find('div.formatted', 0);
        if ($content) {
            // full image quality
            foreach ($content->find('img[data-src-full][src*="."]') as $img) {
                $img->src = $img->getAttribute('data-src-full');
            }

            foreach ($content->find('p, h1, h2, h3, img[src*="."]') as $element) {
                $html .= $element;
            }
        }

        return $html;
    }
}