mirror of
https://github.com/RSS-Bridge/rss-bridge.git
synced 2024-12-18 17:10:29 +03:00
56994b3b5c
The original feed contains a small version of the header image and the summary or a literal "None". The header image is already added, but the original content was kept. This removes the original content and adds the summary if it exists.
146 lines
4.9 KiB
PHP
146 lines
4.9 KiB
PHP
<?php
|
|
|
|
class ZeitBridge extends FeedExpander
|
|
{
|
|
const MAINTAINER = 'Mynacol';
|
|
const NAME = 'Zeit Online Bridge';
|
|
const URI = 'https://www.zeit.de/';
|
|
const CACHE_TIMEOUT = 1800; // 30min
|
|
const DESCRIPTION = 'Returns the full articles instead of only the intro';
|
|
const PARAMETERS = [[
|
|
'category' => [
|
|
'name' => 'Category',
|
|
'type' => 'list',
|
|
'values' => [
|
|
'Startseite'
|
|
=> 'https://newsfeed.zeit.de/index',
|
|
'Politik'
|
|
=> 'https://newsfeed.zeit.de/politik/index',
|
|
'Wirtschaft'
|
|
=> 'https://newsfeed.zeit.de/wirtschaft/index',
|
|
'Gesellschaft'
|
|
=> 'https://newsfeed.zeit.de/gesellschaft/index',
|
|
'Kultur'
|
|
=> 'https://newsfeed.zeit.de/kultur/index',
|
|
'Wissen'
|
|
=> 'https://newsfeed.zeit.de/wissen/index',
|
|
'Digital'
|
|
=> 'https://newsfeed.zeit.de/digital/index',
|
|
'ZEIT Campus ONLINE'
|
|
=> 'https://newsfeed.zeit.de/campus/index',
|
|
'ZEIT ONLINE Arbeit'
|
|
=> 'https://newsfeed.zeit.de/arbeit/index',
|
|
'ZEIT Magazin ONLINE'
|
|
=> 'https://newsfeed.zeit.de/zeit-magazin/index',
|
|
'Entdecken'
|
|
=> 'https://newsfeed.zeit.de/entdecken/index',
|
|
'Mobilität'
|
|
=> 'https://newsfeed.zeit.de/mobilitaet/index',
|
|
'Sport'
|
|
=> 'https://newsfeed.zeit.de/sport/index',
|
|
'Alle Inhalte'
|
|
=> 'https://newsfeed.zeit.de/all'
|
|
]
|
|
],
|
|
'limit' => [
|
|
'name' => 'Limit',
|
|
'type' => 'number',
|
|
'required' => false,
|
|
'title' => 'Specify number of full articles to return',
|
|
'defaultValue' => 5
|
|
]
|
|
]];
|
|
|
|
public function collectData()
|
|
{
|
|
$url = $this->getInput('category');
|
|
$limit = $this->getInput('limit') ?: 5;
|
|
|
|
$this->collectExpandableDatas($url, $limit);
|
|
}
|
|
|
|
protected function parseItem(array $item)
|
|
{
|
|
$item['enclosures'] = [];
|
|
|
|
$headers = [
|
|
'Cookie: zonconsent=' . date('Y-m-d\TH:i:s.v\Z'),
|
|
];
|
|
|
|
// one-page article
|
|
$article = getSimpleHTMLDOM($item['uri'], $headers);
|
|
if ($article->find('a[href="' . $item['uri'] . '/komplettansicht"]', 0)) {
|
|
$item['uri'] .= '/komplettansicht';
|
|
$article = getSimpleHTMLDOM($item['uri'], $headers);
|
|
}
|
|
|
|
$article = defaultLinkTo($article, $item['uri']);
|
|
$item = $this->parseArticle($item, $article);
|
|
|
|
return $item;
|
|
}
|
|
|
|
private function parseArticle($item, $article)
|
|
{
|
|
$article = $article->find('main', 0);
|
|
|
|
// remove known bad elements
|
|
foreach (
|
|
$article->find(
|
|
'aside, .visually-hidden, .carousel-container, #tickaroo-liveblog, .zplus-badge,
|
|
.article-heading__container--podcast, .podcast-player__image, div[data-paywall],
|
|
.js-embed-consent, script, nav, .article-flexible-toc__subheading-link, .faq-link'
|
|
) as $bad
|
|
) {
|
|
$bad->remove();
|
|
}
|
|
// reload html, as remove() is buggy
|
|
$article = str_get_html($article->outertext);
|
|
|
|
// podcast audio, if available
|
|
$podcast_src = $article->find('.article-heading__podcast audio[src]', 0);
|
|
if ($podcast_src) {
|
|
$item['enclosures'][] = $podcast_src->src;
|
|
}
|
|
|
|
// full res images
|
|
foreach ($article->find('img[data-src]') as $img) {
|
|
$img->src = $img->getAttribute('data-src');
|
|
$item['enclosures'][] = $img->src;
|
|
}
|
|
|
|
// authors
|
|
$authors = $article->find('*[itemtype*="schema.org/Person"]') ?? $article->find('.metadata__source');
|
|
if ($authors) {
|
|
$item['author'] = implode(', ', array_map(function ($e) {
|
|
return trim($e->plaintext);
|
|
}, $authors));
|
|
}
|
|
|
|
$item['content'] = '';
|
|
|
|
// summary
|
|
$summary = $article->find('.summary');
|
|
if ($summary) {
|
|
$item['content'] .= implode('', $summary);
|
|
}
|
|
|
|
// header image
|
|
$headerimg = $article->find('*[data-ct-row="headerimage"]', 0) ?? $article->find('.article-header', 0) ?? $article->find('header', 0);
|
|
if ($headerimg) {
|
|
$item['content'] .= implode('', $headerimg->find('img[src], figcaption'));
|
|
}
|
|
|
|
// article content
|
|
$pages = $article->find('.article-page');
|
|
|
|
if ($pages) {
|
|
foreach ($pages as $page) {
|
|
$elements = $page->find('p, ul, ol, h2, figure.article__media img[src], figure.article__media figcaption, figure.quote');
|
|
$item['content'] .= implode('', $elements);
|
|
}
|
|
}
|
|
|
|
return $item;
|
|
}
|
|
}
|