2020-03-31 22:14:16 +03:00
|
|
|
<?php
|
2022-04-13 00:40:37 +03:00
|
|
|
|
2020-03-31 22:14:16 +03:00
|
|
|
/**
 * Bridge for the Bavarian regional news site nordbayern.de.
 *
 * Loads the overview page of the selected region, follows every article
 * teaser found inside its <main> element and turns each article page into a
 * feed item (uri, title, content and, when present on the page, author,
 * timestamp and categories). Articles can optionally be filtered out by
 * origin (NN+), by author (dpa) or by being a police report.
 */
class NordbayernBridge extends BridgeAbstract
{
    const MAINTAINER = 'schabi.org';
    const NAME = 'Nordbayern';
    const CACHE_TIMEOUT = 3600;
    const URI = 'https://www.nordbayern.de';
    const DESCRIPTION = 'Bridge for Bavarian regional news site nordbayern.de';
    const PARAMETERS = [ [
        'region' => [
            'name' => 'region',
            'type' => 'list',
            'exampleValue' => 'Nürnberg',
            'title' => 'Select a region',
            'values' => [
                'Ansbach' => 'ansbach',
                'Bamberg' => 'bamberg',
                'Bayreuth' => 'bayreuth',
                'Erlangen' => 'erlangen',
                'Forchheim' => 'forchheim',
                'Fürth' => 'fuerth',
                'Gunzenhausen' => 'gunzenhausen',
                'Herzogenaurach' => 'herzogenaurach',
                'Höchstadt' => 'hoechstadt',
                'Neumarkt' => 'neumarkt',
                'Neustadt/Aisch-Bad Windsheim' => 'neustadt-aisch-bad-windsheim',
                'Nürnberg' => 'nuernberg',
                'Nürnberger Land' => 'nuernberger-land',
                'Regensburg' => 'regensburg',
                'Roth' => 'roth',
                'Schwabach' => 'schwabach',
                'Weißenburg' => 'weissenburg',
            ],
        ],
        'policeReports' => [
            'name' => 'Police Reports',
            'type' => 'checkbox',
            'exampleValue' => 'checked',
            'title' => 'Include Police Reports',
        ],
        'hideNNPlus' => [
            'name' => 'Hide NN+ articles',
            'type' => 'checkbox',
            'exampleValue' => 'unchecked',
            'title' => 'Hide all paywall articles on NN',
        ],
        'hideDPA' => [
            'name' => 'Hide dpa articles',
            'type' => 'checkbox',
            'exampleValue' => 'unchecked',
            'title' => 'Hide external articles from dpa',
        ],
    ]];

    /**
     * Fetches the overview page of the selected region and collects its
     * articles into $this->items.
     */
    public function collectData()
    {
        $region = $this->getInput('region');
        // Map the short slug onto the long one used in the URL. Note that
        // this slug is not among the values offered in PARAMETERS.
        if ($region === 'rothenburg-o-d-t') {
            $region = 'rothenburg-ob-der-tauber';
        }

        $url = self::URI . '/region/' . $region;
        $listSite = getSimpleHTMLDOM($url);

        $this->handleNewsblock($listSite);
    }

    /**
     * Renders the first <img> of a <picture> element as an HTML snippet.
     *
     * @param object $picture DOM node of the <picture> element
     * @return string '<br><img src="...">', or '' when there is no image,
     *                the image has no usable src, or the src looks like a
     *                logo (path matching "/logo-*.png")
     */
    private function getValidImage($picture)
    {
        $img = $picture->find('img', 0);
        if (!$img) {
            return '';
        }

        $imgUrl = $img->src;
        // A missing src attribute does not yield a string; emitting
        // <img src=""> would only produce a broken image in the feed.
        if (!is_string($imgUrl) || $imgUrl === '') {
            return '';
        }

        if (preg_match('#/logo-.*\.png#', $imgUrl)) {
            return '';
        }

        return '<br><img src="' . $imgUrl . '">';
    }

    /**
     * Recursively extracts the readable part of an article as HTML.
     *
     * Walks the direct children of $rawContent and keeps:
     *  - <p> and <h3> elements, except the teaser paragraph
     *    (class "article__teaser"), which getTeaser() renders separately,
     *  - <ul> elements,
     *  - images of <picture> elements (see getValidImage()),
     * and descends into <main> (its first <article>), <header>, <div>
     * (unless it is an info box or author info) and <section> elements
     * carrying rich text or context. Everything else is dropped.
     *
     * @param object|null $rawContent DOM node whose children are inspected
     * @return string concatenated HTML, '' when $rawContent is null
     */
    private function getUseFullContent($rawContent)
    {
        // find(..., 0) yields null for a missing node (e.g. a <main>
        // without <article>); there is nothing to extract in that case.
        if ($rawContent === null) {
            return '';
        }

        $content = '';
        foreach ($rawContent->children as $element) {
            // A missing class attribute does not yield a string; normalise
            // it so the string comparisons below are well defined.
            $class = (string) $element->class;

            switch ($element->tag) {
                case 'p':
                case 'h3':
                    if ($class !== 'article__teaser') {
                        $content .= $element;
                    }
                    break;

                case 'ul':
                    $content .= $element;
                    break;

                case 'main':
                    $content .= $this->getUseFullContent($element->find('article', 0));
                    break;

                case 'header':
                    $content .= $this->getUseFullContent($element);
                    break;

                case 'div':
                    if (
                        !str_contains($class, 'article__infobox') &&
                        !str_contains($class, 'authorinfo')
                    ) {
                        $content .= $this->getUseFullContent($element);
                    }
                    break;

                case 'section':
                    if (
                        str_contains($class, 'article__richtext') ||
                        str_contains($class, 'article__context')
                    ) {
                        $content .= $this->getUseFullContent($element);
                    }
                    break;

                case 'picture':
                    $content .= $this->getValidImage($element);
                    break;
            }
        }

        return $content;
    }

    /**
     * Builds the teaser paragraph of an article.
     *
     * @param object|null $content DOM node that is searched for the teaser
     * @return string '<p class="article__teaser">...</p>' holding the plain
     *                text of the teaser with runs of spaces collapsed, or ''
     *                when there is no teaser
     */
    private function getTeaser($content)
    {
        if ($content === null) {
            return '';
        }

        $teaser = $content->find('p[class=article__teaser]', 0);
        if ($teaser === null) {
            return '';
        }

        $text = preg_replace('/[ ]{2,}/', ' ', $teaser->plaintext);

        return '<p class="article__teaser">' . $text . '</p>';
    }

    /**
     * Downloads one article page and converts it into a feed item.
     *
     * @param string $link absolute URL of the article
     * @return array feed item with the keys 'uri', 'title' and 'content';
     *               'author', 'timestamp' and 'categories' are only set when
     *               the page provides them
     */
    private function getArticle($link)
    {
        $item = [];
        $article = getSimpleHTMLDOM($link);
        defaultLinkTo($article, self::URI);
        $item['uri'] = $link;

        // Index 1: the second element with this class is used as author.
        $author = $article->find('.article__author', 1);
        if ($author !== null) {
            $item['author'] = trim($author->plaintext);
        }

        $createdAt = $article->find('[class=article__release]', 0);
        if ($createdAt) {
            // Strip the trailing "Uhr" (German for "o'clock") before parsing.
            $timestamp = strtotime(str_replace('Uhr', '', $createdAt->plaintext));
            // strtotime() returns false for unparsable input; do not store
            // that as the item's timestamp.
            if ($timestamp !== false) {
                $item['timestamp'] = $timestamp;
            }
        }

        // Prefer the first <h2> as title and fall back to the first <h3>.
        $heading = $article->find('h2', 0);
        if ($heading === null) {
            $heading = $article->find('h3', 0);
        }
        $item['title'] = $heading !== null ? $heading->innertext : '';

        $item['content'] = '';

        if ($article->find('section[class*=article__richtext]', 0) === null) {
            // No rich text section: only the first paragraph of the teaser
            // module is used as content.
            $teaserModule = $article->find('div[class*=modul__teaser]', 0);
            $paragraph = $teaserModule !== null ? $teaserModule->find('p', 0) : null;
            if ($paragraph !== null) {
                $item['content'] .= $paragraph;
            }
        } else {
            $content = $article->find('article', 0);
            // change order of article teaser in order to show it on top
            // of the title image. If we didn't do this some rss programs
            // would show the subtitle of the title image as teaser instead
            // of the actual article teaser.
            $item['content'] .= $this->getTeaser($content);
            $item['content'] .= $this->getUseFullContent($content);
        }

        $categories = $article->find('[class=themen]', 0);
        if ($categories) {
            $item['categories'] = [];
            foreach ($categories->find('a') as $category) {
                $item['categories'][] = $category->innertext;
            }
        }

        $article->clear();
        return $item;
    }

    /**
     * Turns every <article> teaser inside the <main> element of the region
     * overview page into a feed item, honouring the user's filters.
     *
     * @param object $listSite parsed overview page of the region
     * @throws \RuntimeException when the page has no <main> element
     */
    private function handleNewsblock($listSite)
    {
        $main = $listSite ? $listSite->find('main', 0) : null;
        if ($main === null) {
            throw new \RuntimeException('Could not find the article list on the region page');
        }

        foreach ($main->find('article') as $article) {
            $anchor = $article->find('a', 0);
            // A teaser without a link cannot be followed.
            if ($anchor === null) {
                continue;
            }
            $url = urljoin(self::URI, $anchor->href);

            // exclude nn+ articles if desired
            if (
                $this->getInput('hideNNPlus') &&
                str_contains($url, 'www.nn.de')
            ) {
                continue;
            }

            $item = $this->getArticle($url);

            // exclude police reports if desired
            if (
                !$this->getInput('policeReports') &&
                str_contains($item['content'], 'Hier geht es zu allen aktuellen Polizeimeldungen.')
            ) {
                continue;
            }

            // exclude dpa articles; 'author' is only set when the article
            // page names one, so default to an empty string.
            if (
                $this->getInput('hideDPA') &&
                str_contains($item['author'] ?? '', 'dpa')
            ) {
                continue;
            }

            $this->items[] = $item;
        }
    }
}
|