rss-bridge/bridges/NordbayernBridge.php

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

214 lines
7.1 KiB
PHP
Raw Normal View History

2020-03-31 22:14:16 +03:00
<?php
2020-03-31 22:14:16 +03:00
/**
 * Bridge for the Bavarian regional news site nordbayern.de.
 *
 * Loads the overview page of the selected region, follows every article
 * linked from it and converts each article page into one feed item.
 */
class NordbayernBridge extends BridgeAbstract
{
    const MAINTAINER = 'schabi.org';
    const NAME = 'Nordbayern';
    const CACHE_TIMEOUT = 3600;
    const URI = 'https://www.nordbayern.de';
    const DESCRIPTION = 'Bridge for Bavarian regional news site nordbayern.de';
    const PARAMETERS = [ [
        'region' => [
            'name' => 'region',
            'type' => 'list',
            'exampleValue' => 'Nürnberg',
            'title' => 'Select a region',
            'values' => [
                'Ansbach' => 'ansbach',
                'Bamberg' => 'bamberg',
                'Bayreuth' => 'bayreuth',
                'Erlangen' => 'erlangen',
                'Forchheim' => 'forchheim',
                'Fürth' => 'fuerth',
                'Gunzenhausen' => 'gunzenhausen',
                'Herzogenaurach' => 'herzogenaurach',
                'Höchstadt' => 'hoechstadt',
                'Neumarkt' => 'neumarkt',
                'Neustadt/Aisch-Bad Windsheim' => 'neustadt-aisch-bad-windsheim',
                'Nürnberg' => 'nuernberg',
                'Nürnberger Land' => 'nuernberger-land',
                'Regensburg' => 'regensburg',
                'Roth' => 'roth',
                'Schwabach' => 'schwabach',
                'Weißenburg' => 'weissenburg'
            ]
        ],
        'policeReports' => [
            'name' => 'Police Reports',
            'type' => 'checkbox',
            'exampleValue' => 'checked',
            'title' => 'Include Police Reports',
        ],
        'hideNNPlus' => [
            'name' => 'Hide NN+ articles',
            'type' => 'checkbox',
            'exampleValue' => 'unchecked',
            'title' => 'Hide all paywall articles on NN'
        ],
        'hideDPA' => [
            'name' => 'Hide dpa articles',
            'type' => 'checkbox',
            'exampleValue' => 'unchecked',
            'title' => 'Hide external articles from dpa'
        ]
    ]];

    /**
     * Fetch the overview page of the selected region and collect its
     * articles into $this->items.
     */
    public function collectData()
    {
        $region = $this->getInput('region');
        // Translate the abbreviated slug into the one used in the URL.
        // NOTE(review): this value is not offered by PARAMETERS; the mapping
        // is kept so that previously generated feed URLs keep working.
        if ($region === 'rothenburg-o-d-t') {
            $region = 'rothenburg-ob-der-tauber';
        }
        $url = self::URI . '/region/' . $region;
        $listSite = getSimpleHTMLDOM($url);
        $this->handleNewsblock($listSite);
    }

    /**
     * Build an <img> snippet for the first image inside a <picture> node.
     *
     * @param object $picture DOM node that is searched for an <img> child
     * @return string HTML snippet, or '' when there is no image, the image
     *                has no src, or the src looks like a logo PNG
     */
    private function getValidImage($picture)
    {
        $img = $picture->find('img', 0);
        if (!$img) {
            return '';
        }

        // A missing src attribute would otherwise produce a broken <img src="">.
        $imgUrl = (string) $img->src;
        if ($imgUrl === '') {
            return '';
        }

        // Logo graphics are not part of the article content.
        if (preg_match('#/logo-.*\.png#', $imgUrl)) {
            return '';
        }

        return '<br><img src="' . $imgUrl . '">';
    }

    /**
     * Recursively collect the readable parts of an article subtree.
     *
     * Paragraphs (except the teaser), h3 headings and lists are copied as
     * they are; pictures are reduced to a plain <img>; header, main, div and
     * selected section containers are descended into; everything else is
     * dropped.
     *
     * @param object|null $rawContent DOM node whose children are inspected
     * @return string concatenated HTML of the accepted elements
     */
    private function getUseFullContent($rawContent)
    {
        // find() yields null when a container (e.g. <article> inside <main>)
        // is missing; treat that as "no content" instead of failing.
        if (!$rawContent) {
            return '';
        }

        $content = '';
        foreach ($rawContent->children as $element) {
            $tag = $element->tag;
            // Normalise to a string so the str_contains() checks below never
            // receive a non-string value for elements without a class.
            $class = (string) $element->class;

            if (($tag === 'p' || $tag === 'h3') && $class !== 'article__teaser') {
                $content .= $element;
            } elseif ($tag === 'main') {
                $content .= $this->getUseFullContent($element->find('article', 0));
            } elseif ($tag === 'header') {
                $content .= $this->getUseFullContent($element);
            } elseif (
                $tag === 'div' &&
                !str_contains($class, 'article__infobox') &&
                !str_contains($class, 'authorinfo')
            ) {
                $content .= $this->getUseFullContent($element);
            } elseif (
                $tag === 'section' &&
                (str_contains($class, 'article__richtext') ||
                str_contains($class, 'article__context'))
            ) {
                $content .= $this->getUseFullContent($element);
            } elseif ($tag === 'picture') {
                $content .= $this->getValidImage($element);
            } elseif ($tag === 'ul') {
                $content .= $element;
            }
        }
        return $content;
    }

    /**
     * Extract the article teaser as a single paragraph.
     *
     * @param object $content DOM node that is searched for the teaser
     * @return string teaser wrapped in <p class="article__teaser">, or ''
     *                when the node contains no teaser paragraph
     */
    private function getTeaser($content)
    {
        $teaser = $content->find('p[class=article__teaser]', 0);
        if ($teaser === null) {
            return '';
        }
        // Collapse runs of spaces in the extracted text.
        $text = preg_replace('/[ ]{2,}/', ' ', $teaser->plaintext);
        return '<p class="article__teaser">' . $text . '</p>';
    }

    /**
     * Download one article page and convert it into a feed item.
     *
     * @param string $link absolute URL of the article
     * @return array feed item with 'uri', 'title' and 'content', plus
     *               'author', 'timestamp' and 'categories' when the page
     *               provides them
     */
    private function getArticle($link)
    {
        $item = [];
        $article = getSimpleHTMLDOM($link);
        defaultLinkTo($article, self::URI);
        $item['uri'] = $link;

        $author = $article->find('.article__author', 1);
        if ($author !== null) {
            $item['author'] = trim($author->plaintext);
        }

        $createdAt = $article->find('[class=article__release]', 0);
        if ($createdAt) {
            // Strip the trailing German "Uhr" before handing the text to strtotime().
            $timestamp = strtotime(str_replace('Uhr', '', $createdAt->plaintext));
            if ($timestamp !== false) {
                $item['timestamp'] = $timestamp;
            }
        }

        // Prefer the h2 headline and fall back to h3; an article without
        // either gets an empty title instead of aborting the whole feed.
        $heading = $article->find('h2', 0) ?? $article->find('h3', 0);
        $item['title'] = $heading !== null ? $heading->innertext : '';

        $item['content'] = '';
        if ($article->find('section[class*=article__richtext]', 0) === null) {
            // No rich text section: only the first teaser paragraph is used.
            $teaserBox = $article->find('div[class*=modul__teaser]', 0);
            $teaserText = $teaserBox ? $teaserBox->find('p', 0) : null;
            if ($teaserText) {
                $item['content'] .= $teaserText;
            }
        } else {
            $content = $article->find('article', 0);
            // Change the order of the article teaser so that it is shown
            // above the title image. Otherwise some RSS programs would show
            // the caption of the title image as teaser instead of the actual
            // article teaser.
            if ($content) {
                $item['content'] .= $this->getTeaser($content);
                $item['content'] .= $this->getUseFullContent($content);
            }
        }

        $categories = $article->find('[class=themen]', 0);
        if ($categories) {
            $item['categories'] = [];
            foreach ($categories->find('a') as $category) {
                $item['categories'][] = $category->innertext;
            }
        }

        $article->clear();
        return $item;
    }

    /**
     * Turn every article teaser of a region overview page into a feed item,
     * applying the user's filter options.
     *
     * @param object $listSite parsed region overview page
     * @throws \RuntimeException when the page has no <main> element
     */
    private function handleNewsblock($listSite)
    {
        $main = $listSite->find('main', 0);
        if ($main === null) {
            throw new \RuntimeException('Nordbayern: no <main> element found on the region page');
        }

        // The options do not change while iterating, so read them once.
        $hideNNPlus = $this->getInput('hideNNPlus');
        $includePoliceReports = $this->getInput('policeReports');
        $hideDPA = $this->getInput('hideDPA');

        foreach ($main->find('article') as $article) {
            $anchor = $article->find('a', 0);
            if ($anchor === null) {
                // Teaser without a link: nothing to fetch.
                continue;
            }
            $url = urljoin(self::URI, $anchor->href);

            // exclude nn+ articles if desired
            if ($hideNNPlus && str_contains($url, 'www.nn.de')) {
                continue;
            }

            $item = $this->getArticle($url);

            // exclude police reports if desired
            if (
                !$includePoliceReports &&
                str_contains($item['content'], 'Hier geht es zu allen aktuellen Polizeimeldungen.')
            ) {
                continue;
            }

            // exclude dpa articles; 'author' is only set when the article
            // page names one, so default to an empty string
            if ($hideDPA && str_contains($item['author'] ?? '', 'dpa')) {
                continue;
            }

            $this->items[] = $item;
        }
    }
}