2022-09-21 23:24:11 +03:00
|
|
|
<?php
|
|
|
|
|
|
|
|
/**
 * Bridge for Zeit Online (zeit.de).
 *
 * Expands one of the publisher's category feeds and, for each feed item,
 * downloads the linked article page to replace the feed's short intro with
 * the article's header image, body, authors and media enclosures.
 */
class ZeitBridge extends FeedExpander
{
    const MAINTAINER = 'Mynacol';
    const NAME = 'Zeit Online Bridge';
    const URI = 'https://www.zeit.de/';
    const CACHE_TIMEOUT = 1800; // 30min
    const DESCRIPTION = 'Returns the full articles instead of only the intro';
    const PARAMETERS = [[
        'category' => [
            'name' => 'Category',
            'type' => 'list',
            'values' => [
                'Startseite' => 'https://newsfeed.zeit.de/index',
                'Politik' => 'https://newsfeed.zeit.de/politik/index',
                'Wirtschaft' => 'https://newsfeed.zeit.de/wirtschaft/index',
                'Gesellschaft' => 'https://newsfeed.zeit.de/gesellschaft/index',
                'Kultur' => 'https://newsfeed.zeit.de/kultur/index',
                'Wissen' => 'https://newsfeed.zeit.de/wissen/index',
                'Digital' => 'https://newsfeed.zeit.de/digital/index',
                'ZEIT Campus ONLINE' => 'https://newsfeed.zeit.de/campus/index',
                'ZEIT ONLINE Arbeit' => 'https://newsfeed.zeit.de/arbeit/index',
                'ZEIT Magazin ONLINE' => 'https://newsfeed.zeit.de/zeit-magazin/index',
                'Entdecken' => 'https://newsfeed.zeit.de/entdecken/index',
                'Mobilität' => 'https://newsfeed.zeit.de/mobilitaet/index',
                'Sport' => 'https://newsfeed.zeit.de/sport/index',
                'Alle Inhalte' => 'https://newsfeed.zeit.de/all'
            ]
        ],
        'limit' => [
            'name' => 'Limit',
            'type' => 'number',
            'required' => false,
            'title' => 'Specify number of full articles to return',
            'defaultValue' => 5
        ]
    ]];

    /**
     * Expands the feed selected via the 'category' input.
     *
     * The 'limit' input caps how many items are expanded; an empty or zero
     * value falls back to 5.
     */
    public function collectData()
    {
        $url = $this->getInput('category');
        $limit = $this->getInput('limit') ?: 5;

        $this->collectExpandableDatas($url, $limit);
    }

    /**
     * Enriches a single feed item with the content of its article page.
     *
     * @param array $item Feed item; its 'uri' entry is the article URL.
     * @return array The item with 'uri', 'enclosures', 'author' and 'content' updated.
     */
    protected function parseItem(array $item)
    {
        $item['enclosures'] = [];

        // The cookie value is an ISO-8601 timestamp with a literal "Z" (UTC) suffix,
        // so it has to be formatted in UTC rather than in the server's local timezone.
        // NOTE(review): presumably this cookie marks the consent dialog as answered - confirm.
        $headers = [
            'Cookie: zonconsent=' . gmdate('Y-m-d\TH:i:s.v\Z'),
        ];

        $article = getSimpleHTMLDOM($item['uri'], $headers);

        // Multi-page articles link to a one-page view ("komplettansicht"); prefer it when present.
        $onePageUri = $item['uri'] . '/komplettansicht';
        if ($article->find('a[href="' . $onePageUri . '"]', 0)) {
            $item['uri'] = $onePageUri;
            $article = getSimpleHTMLDOM($item['uri'], $headers);
        }

        $article = defaultLinkTo($article, $item['uri']);

        return $this->parseArticle($item, $article);
    }

    /**
     * Extracts enclosures, authors, header image and body from an article DOM.
     *
     * @param array $item    Feed item to enrich; extracted content is appended to its 'content'.
     * @param mixed $article Parsed article page as returned by getSimpleHTMLDOM()/defaultLinkTo().
     * @return array The enriched item, or the unchanged item when the page has no usable <main>.
     */
    private function parseArticle($item, $article)
    {
        $main = $article->find('main', 0);
        if (!$main) {
            // Unexpected page layout: keep the plain feed item instead of failing the whole feed.
            return $item;
        }

        // remove known bad elements
        $badSelectors = [
            'aside',
            '.visually-hidden',
            '.carousel-container',
            '#tickaroo-liveblog',
            '.zplus-badge',
            '.article-heading__container--podcast',
            '.podcast-player__image',
            'div[data-paywall]',
            '.js-embed-consent',
            'script',
            'nav',
            '.article-flexible-toc__subheading-link',
            '.faq-link',
        ];
        foreach ($main->find(implode(', ', $badSelectors)) as $bad) {
            $bad->remove();
        }

        // reload html, as remove() is buggy
        $main = str_get_html($main->outertext);
        if (!$main) {
            // Nothing parseable is left after the cleanup.
            return $item;
        }

        // podcast audio, if available
        $podcastAudio = $main->find('.article-heading__podcast audio[src]', 0);
        if ($podcastAudio) {
            $item['enclosures'][] = $podcastAudio->src;
        }

        // full res images: promote the lazy-load URL to src and expose it as an enclosure
        foreach ($main->find('img[data-src]') as $img) {
            $img->src = $img->getAttribute('data-src');
            $item['enclosures'][] = $img->src;
        }

        // authors: find() without an index yields an array (possibly empty), never null,
        // so the fallback selector has to be chained with ?: instead of ??.
        $authors = $main->find('*[itemtype*="schema.org/Person"]') ?: $main->find('.metadata__source');
        if ($authors) {
            $item['author'] = implode(', ', array_map(function ($e) {
                return trim($e->plaintext);
            }, $authors));
        }

        // Build on whatever content the feed already supplied, tolerating a missing key.
        $content = $item['content'] ?? '';

        // header image: find() with an index yields null when nothing matches, so ?? is correct here
        $headerimg = $main->find('*[data-ct-row="headerimage"]', 0)
            ?? $main->find('.article-header', 0)
            ?? $main->find('header', 0);
        if ($headerimg) {
            $content .= implode('', $headerimg->find('img[src], figcaption'));
        }

        // article content, page by page
        $bodySelector = 'p, ul, ol, h2, figure.article__media img[src], '
            . 'figure.article__media figcaption, figure.quote';
        foreach ($main->find('.article-page') as $page) {
            $content .= implode('', $page->find($bodySelector));
        }

        $item['content'] = $content;

        return $item;
    }
}
|