rss-bridge/bridges/SchweinfurtBuergerinformationenBridge.php

129 lines
4.5 KiB
PHP
Raw Normal View History

<?php
class SchweinfurtBuergerinformationenBridge extends BridgeAbstract
{
const MAINTAINER = 'mibe';
const NAME = 'Schweinfurt Bürgerinformationen';
const URI = 'https://www.schweinfurt.de/rathaus-politik/pressestelle/buergerinformationen/index.html';
const ARTICLE_URI = 'https://www.schweinfurt.de/rathaus-politik/pressestelle/buergerinformationen/%d.html';
const INDEX_CACHE_TIMEOUT = 10800; // 3h
const ARTICLE_CACHE_TIMEOUT = 21600; // 6h
const DESCRIPTION = 'Returns the latest news for citizens of Schweinfurt';
const PARAMETERS = [
[
'pages' => [
'name' => 'Number of pages',
'type' => 'number',
'title' => 'Specifies the number of pages to fetch. Usually one or two are enough.',
'exampleValue' => '1',
'defaultValue' => '1',
]
]
];
public function getIcon()
{
return 'https://www.schweinfurt.de/__/images/favicon.ico';
}
public function collectData()
{
// Get number of pages to retrieve. One page is the minimum.
$pages = $this->getInput('pages');
if (!is_int($pages) || $pages < 1) {
$pages = 1;
}
$articleIDs = [];
for ($page = 0; $page < $pages; $page++) {
$newIDs = $this->getArticleIDsFromPage($page);
$articleIDs = array_merge($articleIDs, $newIDs);
}
foreach ($articleIDs as $articleID) {
$this->items[] = $this->generateItemFromArticle($articleID);
}
}
private function getArticleIDsFromPage($page)
{
$url = sprintf(self::URI . '?art_pager=%d', $page);
$html = getSimpleHTMLDOMCached($url, self::INDEX_CACHE_TIMEOUT)
or returnServerError('Could not retrieve ' . $url);
$articles = $html->find('div.artikel-uebersicht');
$articleIDs = [];
foreach ($articles as $article) {
// The article ID is in the 'id' attribute of the div element, prefixed with 'artikel_id_'
if (preg_match('/artikel_id_(\d+)/', $article->id, $match)) {
$articleIDs[] = $match[1];
} else {
returnServerError('Couldn\'t determine article ID from index page.');
}
}
return $articleIDs;
}
private function generateItemFromArticle($id)
{
$url = sprintf(self::ARTICLE_URI, $id);
$html = getSimpleHTMLDOMCached($url, self::ARTICLE_CACHE_TIMEOUT)
or returnServerError('Could not retrieve ' . $url);
$div = $html->find('div#artikel-detail', 0);
$divContent = $div->find('.c-content', 0);
$images = $divContent->find('img');
// Every external link has a little arrow symbol image attached to it.
// Remove this image. This has to be done before building $content.
foreach ($images as $image) {
if ($image->class == 'imgextlink') {
$image->outertext = '';
}
}
$title = $div->find('.c-title', 0)->innertext;
$teaser = $div->find('.c-teaser', 0)->innertext;
$content = $divContent->innertext;
// The title can contain HTML entities. These can be converted back
// to regular UTF-8 characters.
$title = html_entity_decode($title, ENT_HTML5, 'UTF-8');
// If there's a teaser, make it more eye-catching,
// so that it is clear, that this is not part of the actual content.
if (strlen(trim($teaser)) > 0) {
$content = '<i><strong>' . $teaser . '</strong></i>' . $content;
}
$item = [
'uri' => $url,
'title' => $title,
'content' => $content,
'uid' => $id,
];
// Let's see if there are images in the content, and if yes, attach
// them as enclosures, but not images which are used for linking to an external site and data URIs.
foreach ($images as $image) {
if ($image->class != 'imgextlink' && parse_url($image->src, PHP_URL_SCHEME) != 'data') {
$item['enclosures'][] = $image->src;
}
}
// Get the date of the article. Example: "zuletzt geändert: 26.05.2020"
$editDate = $div->find('div#edit', 0)->plaintext;
$editDate = substr($editDate, strrpos($editDate, ' ') + 1);
$editDate = DateTime::createFromFormat('d.m.Y', $editDate);
if ($editDate !== false) {
$item['timestamp'] = $editDate->getTimestamp();
}
return $item;
}
}