mirror of
https://github.com/RSS-Bridge/rss-bridge.git
synced 2024-11-21 17:15:25 +03:00
ccc20849ff
See also setEnclosures() in FeedItem.php: URIs with a path are required.
128 lines
4.5 KiB
PHP
128 lines
4.5 KiB
PHP
<?php
|
|
|
|
class SchweinfurtBuergerinformationenBridge extends BridgeAbstract
|
|
{
|
|
const MAINTAINER = 'mibe';
|
|
const NAME = 'Schweinfurt Bürgerinformationen';
|
|
const URI = 'https://www.schweinfurt.de/rathaus-politik/pressestelle/buergerinformationen/index.html';
|
|
const ARTICLE_URI = 'https://www.schweinfurt.de/rathaus-politik/pressestelle/buergerinformationen/%d.html';
|
|
const INDEX_CACHE_TIMEOUT = 10800; // 3h
|
|
const ARTICLE_CACHE_TIMEOUT = 21600; // 6h
|
|
const DESCRIPTION = 'Returns the latest news for citizens of Schweinfurt';
|
|
const PARAMETERS = [
|
|
[
|
|
'pages' => [
|
|
'name' => 'Number of pages',
|
|
'type' => 'number',
|
|
'title' => 'Specifies the number of pages to fetch. Usually one or two are enough.',
|
|
'exampleValue' => '1',
|
|
'defaultValue' => '1',
|
|
]
|
|
]
|
|
];
|
|
|
|
public function getIcon()
|
|
{
|
|
return 'https://www.schweinfurt.de/__/images/favicon.ico';
|
|
}
|
|
|
|
public function collectData()
|
|
{
|
|
// Get number of pages to retrieve. One page is the minimum.
|
|
$pages = $this->getInput('pages');
|
|
if (!is_int($pages) || $pages < 1) {
|
|
$pages = 1;
|
|
}
|
|
|
|
$articleIDs = [];
|
|
|
|
for ($page = 0; $page < $pages; $page++) {
|
|
$newIDs = $this->getArticleIDsFromPage($page);
|
|
$articleIDs = array_merge($articleIDs, $newIDs);
|
|
}
|
|
|
|
foreach ($articleIDs as $articleID) {
|
|
$this->items[] = $this->generateItemFromArticle($articleID);
|
|
}
|
|
}
|
|
|
|
private function getArticleIDsFromPage($page)
|
|
{
|
|
$url = sprintf(self::URI . '?art_pager=%d', $page);
|
|
$html = getSimpleHTMLDOMCached($url, self::INDEX_CACHE_TIMEOUT)
|
|
or returnServerError('Could not retrieve ' . $url);
|
|
|
|
$articles = $html->find('div.artikel-uebersicht');
|
|
$articleIDs = [];
|
|
|
|
foreach ($articles as $article) {
|
|
// The article ID is in the 'id' attribute of the div element, prefixed with 'artikel_id_'
|
|
if (preg_match('/artikel_id_(\d+)/', $article->id, $match)) {
|
|
$articleIDs[] = $match[1];
|
|
} else {
|
|
returnServerError('Couldn\'t determine article ID from index page.');
|
|
}
|
|
}
|
|
|
|
return $articleIDs;
|
|
}
|
|
|
|
private function generateItemFromArticle($id)
|
|
{
|
|
$url = sprintf(self::ARTICLE_URI, $id);
|
|
$html = getSimpleHTMLDOMCached($url, self::ARTICLE_CACHE_TIMEOUT)
|
|
or returnServerError('Could not retrieve ' . $url);
|
|
|
|
$div = $html->find('div#artikel-detail', 0);
|
|
$divContent = $div->find('.c-content', 0);
|
|
$images = $divContent->find('img');
|
|
|
|
// Every external link has a little arrow symbol image attached to it.
|
|
// Remove this image. This has to be done before building $content.
|
|
foreach ($images as $image) {
|
|
if ($image->class == 'imgextlink') {
|
|
$image->outertext = '';
|
|
}
|
|
}
|
|
|
|
$title = $div->find('.c-title', 0)->innertext;
|
|
$teaser = $div->find('.c-teaser', 0)->innertext;
|
|
$content = $divContent->innertext;
|
|
|
|
// The title can contain HTML entities. These can be converted back
|
|
// to regular UTF-8 characters.
|
|
$title = html_entity_decode($title, ENT_HTML5, 'UTF-8');
|
|
|
|
// If there's a teaser, make it more eye-catching,
|
|
// so that it is clear, that this is not part of the actual content.
|
|
if (strlen(trim($teaser)) > 0) {
|
|
$content = '<i><strong>' . $teaser . '</strong></i>' . $content;
|
|
}
|
|
|
|
$item = [
|
|
'uri' => $url,
|
|
'title' => $title,
|
|
'content' => $content,
|
|
'uid' => $id,
|
|
];
|
|
|
|
// Let's see if there are images in the content, and if yes, attach
|
|
// them as enclosures, but not images which are used for linking to an external site and data URIs.
|
|
foreach ($images as $image) {
|
|
if ($image->class != 'imgextlink' && parse_url($image->src, PHP_URL_SCHEME) != 'data') {
|
|
$item['enclosures'][] = $image->src;
|
|
}
|
|
}
|
|
|
|
// Get the date of the article. Example: "zuletzt geändert: 26.05.2020"
|
|
$editDate = $div->find('div#edit', 0)->plaintext;
|
|
$editDate = substr($editDate, strrpos($editDate, ' ') + 1);
|
|
$editDate = DateTime::createFromFormat('d.m.Y', $editDate);
|
|
|
|
if ($editDate !== false) {
|
|
$item['timestamp'] = $editDate->getTimestamp();
|
|
}
|
|
|
|
return $item;
|
|
}
|
|
}
|