rss-bridge/bridges/NordbayernBridge.php

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

178 lines
6.1 KiB
PHP
Raw Normal View History

2020-03-31 22:14:16 +03:00
<?php
2020-03-31 22:14:16 +03:00
class NordbayernBridge extends BridgeAbstract
{
// Bridge metadata constants.
const MAINTAINER = 'schabi.org';
const NAME = 'Nordbayern';
// presumably seconds (i.e. one hour) — confirm against the RSS-Bridge core
const CACHE_TIMEOUT = 3600;
const URI = 'https://www.nordbayern.de';
const DESCRIPTION = 'Bridge for Bavarian regional news site nordbayern.de';

// User-facing inputs of the bridge.
const PARAMETERS = [ [
    // The selected value is the URL slug appended to URI . '/region/' in
    // collectData(); 'rothenburg-o-d-t' is remapped there to the slug the
    // site actually uses.
    'region' => [
        'name' => 'region',
        'type' => 'list',
        'exampleValue' => 'Nürnberg',
        'title' => 'Select a region',
        'values' => [
            'Nürnberg' => 'nuernberg',
            'Fürth' => 'fuerth',
            'Erlangen' => 'erlangen',
            'Altdorf' => 'altdorf',
            'Ansbach' => 'ansbach',
            'Bad Windsheim' => 'bad-windsheim',
            'Bamberg' => 'bamberg',
            'Dinkelsbühl/Feuchtwangen' => 'dinkelsbuehl-feuchtwangen',
            'Feucht' => 'feucht',
            'Forchheim' => 'forchheim',
            'Gunzenhausen' => 'gunzenhausen',
            'Hersbruck' => 'hersbruck',
            'Herzogenaurach' => 'herzogenaurach',
            'Hilpoltstein' => 'hilpoltstein',
            'Höchstadt' => 'hoechstadt',
            'Lauf' => 'lauf',
            'Neumarkt' => 'neumarkt',
            'Neustadt/Aisch' => 'neustadt-aisch',
            'Pegnitz' => 'pegnitz',
            'Roth' => 'roth',
            'Rothenburg o.d.T.' => 'rothenburg-o-d-t',
            'Treuchtlingen' => 'treuchtlingen',
            'Weißenburg' => 'weissenburg'
        ]
    ],
    // When unchecked, handleArticle() drops articles that contain the
    // police-report marker sentence.
    // NOTE(review): RSS-Bridge checkboxes are usually pre-ticked through
    // 'defaultValue' — confirm that 'exampleValue' is what is intended here.
    'policeReports' => [
        'name' => 'Police Reports',
        'type' => 'checkbox',
        'exampleValue' => 'checked',
        'title' => 'Include Police Reports',
    ]
]];
2020-03-31 22:14:16 +03:00
/**
 * Build the HTML snippet for the first image found inside a <picture> node.
 *
 * Images whose URL matches "/logo-….png" are left out, as are <img> tags
 * that carry no usable src attribute.
 *
 * @param object $picture DOM node exposing find() — presumably a
 *                        simple_html_dom node for a <picture> element
 * @return string '<br><img src="…">' for a usable image, otherwise ''
 */
private function getValidImage($picture): string
{
    $img = $picture->find('img', 0);
    if (!$img) {
        return '';
    }

    $imgUrl = $img->src;
    // Without a src there is nothing to show; avoid emitting <img src="">.
    if (!is_string($imgUrl) || $imgUrl === '') {
        return '';
    }

    // Skip logo graphics; only real article pictures are wanted.
    if (preg_match('#/logo-.*\.png#', $imgUrl)) {
        return '';
    }

    return '<br><img src="' . $imgUrl . '">';
}
2020-03-31 22:14:16 +03:00
2022-05-02 20:06:30 +03:00
/**
 * Recursively collect the readable parts of an article subtree as HTML.
 *
 * Walks the direct children of $rawContent and
 *  - keeps <p> and <h3> elements verbatim, except the teaser paragraph
 *    (class exactly "article__teaser"), which getTeaser() handles,
 *  - descends into <main> (through its first <article>), <header>, any <div>
 *    that is neither an info box nor an author box, and <section> elements
 *    marked as rich text or context,
 *  - turns <picture> elements into an <img> snippet via getValidImage().
 * Everything else is dropped.
 *
 * @param object|null $rawContent DOM node exposing ->children and find() —
 *                                presumably a simple_html_dom node; null is
 *                                accepted and yields ''
 * @return string concatenated HTML of the kept elements
 */
private function getUseFullContent($rawContent): string
{
    // find(..., 0) returns null when nothing matched; without this guard the
    // recursive call for <main> would iterate over the children of null.
    if ($rawContent === null) {
        return '';
    }

    $content = '';
    foreach ($rawContent->children as $element) {
        $tag = $element->tag;
        // The class attribute may be missing; normalise it to a string so
        // the str_contains() checks always receive one.
        $class = (string) $element->class;

        $isWantedDiv = $tag === 'div'
            && !str_contains($class, 'article__infobox')
            && !str_contains($class, 'authorinfo');
        $isWantedSection = $tag === 'section'
            && (str_contains($class, 'article__richtext') || str_contains($class, 'article__context'));

        if (($tag === 'p' || $tag === 'h3') && $class !== 'article__teaser') {
            $content .= $element;
        } elseif ($tag === 'main') {
            $content .= $this->getUseFullContent($element->find('article', 0));
        } elseif ($tag === 'header' || $isWantedDiv || $isWantedSection) {
            $content .= $this->getUseFullContent($element);
        } elseif ($tag === 'picture') {
            $content .= $this->getValidImage($element);
        }
    }

    return $content;
}
/**
 * Extract the article teaser as a standalone paragraph.
 *
 * Looks for the first <p> whose class is exactly "article__teaser", takes its
 * plain text, collapses runs of two or more spaces into one and wraps the
 * result in a fresh teaser paragraph.
 *
 * @param object|null $content DOM node exposing find() — presumably the
 *                             simple_html_dom node of the <article> element
 * @return string '<p class="article__teaser">…</p>', or '' when there is no
 *                teaser (or no node to search)
 */
private function getTeaser($content): string
{
    if ($content === null) {
        return '';
    }

    $teaserNode = $content->find('p[class=article__teaser]', 0);
    if ($teaserNode === null) {
        return '';
    }

    $text = preg_replace('/[ ]{2,}/', ' ', $teaserNode->plaintext);

    return '<p class="article__teaser">' . $text . '</p>';
}
2022-06-04 21:50:16 +03:00
/**
 * Fetch a single article page and append it to $this->items.
 *
 * Fills uri, author, timestamp, title and content. The item is dropped when
 * the "policeReports" input is not set and the content contains the
 * police-report marker sentence.
 *
 * @param string $link absolute URL of the article page
 * @return void
 */
private function handleArticle($link)
{
    $item = [];
    $article = getSimpleHTMLDOM($link);
    defaultLinkTo($article, self::URI);
    $item['uri'] = $link;

    // NOTE(review): index 1 selects the *second* .article__author match —
    // presumably the first match is not the byline; confirm against the site.
    $author = $article->find('.article__author', 1);
    if ($author !== null) {
        $item['author'] = trim($author->plaintext);
    }

    $createdAt = $article->find('[class=article__release]', 0);
    if ($createdAt) {
        // Strip the German "Uhr" suffix so strtotime() can parse the rest.
        $timestamp = strtotime(str_replace('Uhr', '', $createdAt->plaintext));
        if ($timestamp !== false) {
            $item['timestamp'] = $timestamp;
        }
    }

    // Prefer the first <h2>; fall back to the first <h3>.
    $headline = $article->find('h2', 0) ?? $article->find('h3', 0);
    if ($headline !== null) {
        $item['title'] = $headline->innertext;
    }

    $item['content'] = '';
    if ($article->find('section[class*=article__richtext]', 0) === null) {
        // No rich-text body: fall back to the first paragraph of the teaser
        // module, if the page has one.
        $teaserModule = $article->find('div[class*=modul__teaser]', 0);
        $paragraph = $teaserModule !== null ? $teaserModule->find('p', 0) : null;
        if ($paragraph !== null) {
            $item['content'] .= $paragraph;
        }
    } else {
        $content = $article->find('article', 0);
        if ($content !== null) {
            // Put the article teaser first so it appears above the title
            // image. Otherwise some RSS readers would show the caption of
            // the title image as the teaser instead of the actual article
            // teaser.
            $item['content'] .= $this->getTeaser($content);
            $item['content'] .= $this->getUseFullContent($content);
        }
    }

    // exclude police reports if desired
    $isPoliceReport = str_contains($item['content'], 'Hier geht es zu allen aktuellen Polizeimeldungen.');
    if ($this->getInput('policeReports') || !$isPoliceReport) {
        $this->items[] = $item;
    }

    $article->clear();
}
/**
 * Process every article teaser found in the <main> element of a region page.
 *
 * For each <article> the first link is resolved against the site URI and
 * handed to handleArticle(). Teasers without a link are skipped.
 *
 * @param object $listSite parsed region overview page exposing find() —
 *                         presumably a simple_html_dom document
 * @return void
 * @throws \Exception when the page has no <main> element
 */
private function handleNewsblock($listSite)
{
    $main = $listSite->find('main', 0);
    if ($main === null) {
        // Fail with a readable message instead of an opaque
        // "member function on null" error.
        throw new \Exception('Nordbayern: no <main> element found on the region page');
    }

    foreach ($main->find('article') as $article) {
        $anchor = $article->find('a', 0);
        if ($anchor === null || !$anchor->href) {
            continue;
        }

        $this->handleArticle(urljoin(self::URI, $anchor->href));
    }
}
2020-03-31 22:14:16 +03:00
/**
 * Bridge entry point: load the overview page of the selected region and
 * collect all articles linked from it into $this->items.
 *
 * @return void
 */
public function collectData()
{
    $region = $this->getInput('region');

    // The parameter value 'rothenburg-o-d-t' differs from the slug used in
    // the site URL, so translate it before building the address.
    if ($region === 'rothenburg-o-d-t') {
        $region = 'rothenburg-ob-der-tauber';
    }

    $listSite = getSimpleHTMLDOM(self::URI . '/region/' . $region);
    $this->handleNewsblock($listSite);
}
}