rss-bridge/bridges/SchweinfurtBuergerinformationenBridge.php

<?php

/**
 * Bridge for the "Bürgerinformationen" (citizen information) section of schweinfurt.de.
 *
 * The article IDs are scraped from the paginated index page. Every article page is then
 * fetched on its own and converted into one feed item.
 */
class SchweinfurtBuergerinformationenBridge extends BridgeAbstract
{
    const MAINTAINER = 'mibe';
    const NAME = 'Schweinfurt Bürgerinformationen';
    const URI = 'https://www.schweinfurt.de/rathaus-politik/pressestelle/buergerinformationen/index.html';
    const ARTICLE_URI = 'https://www.schweinfurt.de/rathaus-politik/pressestelle/buergerinformationen/%d.html';
    const INDEX_CACHE_TIMEOUT = 10800; // 3h
    const ARTICLE_CACHE_TIMEOUT = 21600; // 6h
    const DESCRIPTION = 'Returns the latest news for citizens of Schweinfurt';
    const PARAMETERS = [
        [
            'pages' => [
                'name' => 'Number of pages',
                'type' => 'number',
                'title' => 'Specifies the number of pages to fetch. Usually one or two are enough.',
                'exampleValue' => '1',
                'defaultValue' => '1',
            ]
        ]
    ];

    /**
     * Returns the URL of the icon shown for this bridge.
     *
     * @return string
     */
    public function getIcon()
    {
        return 'https://www.schweinfurt.de/__/images/favicon.ico';
    }

    /**
     * Collects the article IDs from the requested number of index pages and
     * appends one feed item per article to $this->items.
     */
    public function collectData()
    {
        // One page is the minimum. FILTER_VALIDATE_INT accepts a real integer as well as a
        // numeric string such as '2'; anything else (missing, non-numeric, zero, negative)
        // falls back to a single page.
        $pages = filter_var(
            $this->getInput('pages'),
            FILTER_VALIDATE_INT,
            ['options' => ['min_range' => 1]]
        );
        if ($pages === false) {
            $pages = 1;
        }

        $articleIDs = [];

        // The pager of the index page is zero-based.
        for ($page = 0; $page < $pages; $page++) {
            $articleIDs = array_merge($articleIDs, $this->getArticleIDsFromPage($page));
        }

        foreach ($articleIDs as $articleID) {
            $this->items[] = $this->generateItemFromArticle($articleID);
        }
    }

    /**
     * Extracts the IDs of all articles listed on one index page.
     *
     * @param int $page Zero-based index of the pager page to read.
     * @return string[] The article IDs (digit strings) in the order they appear on the page.
     */
    private function getArticleIDsFromPage(int $page): array
    {
        $url = sprintf(self::URI . '?art_pager=%d', $page);
        $html = getSimpleHTMLDOMCached($url, self::INDEX_CACHE_TIMEOUT)
            or returnServerError('Could not retrieve ' . $url);

        $articleIDs = [];

        foreach ($html->find('div.artikel-uebersicht') as $article) {
            // The article ID is in the 'id' attribute of the div element, prefixed with 'artikel_id_'
            if (preg_match('/artikel_id_(\d+)/', (string) $article->id, $match)) {
                $articleIDs[] = $match[1];
            } else {
                returnServerError('Couldn\'t determine article ID from index page.');
            }
        }

        return $articleIDs;
    }

    /**
     * Fetches a single article page and converts it into a feed item.
     *
     * @param int|string $id The article ID as found on the index page.
     * @return array Feed item with the keys 'uri', 'title', 'content' and 'uid', plus
     *               'enclosures' and 'timestamp' when they could be determined.
     */
    private function generateItemFromArticle($id): array
    {
        $url = sprintf(self::ARTICLE_URI, $id);
        $html = getSimpleHTMLDOMCached($url, self::ARTICLE_CACHE_TIMEOUT)
            or returnServerError('Could not retrieve ' . $url);

        // Without the container or the content element there is nothing to build an item from.
        // Report that explicitly instead of failing on a member access further down.
        // NOTE(review): this relies on returnServerError() aborting (as the existing
        // "or returnServerError(...)" calls in this class already do) - confirm.
        $div = $html->find('div#artikel-detail', 0);
        if (!$div) {
            returnServerError('Could not find the article container in ' . $url);
        }

        $divContent = $div->find('.c-content', 0);
        if (!$divContent) {
            returnServerError('Could not find the article content in ' . $url);
        }

        $enclosures = [];

        foreach ($divContent->find('img') as $image) {
            // Every external link has a little arrow symbol image attached to it.
            // Remove this image. This has to be done before building $content.
            if ($image->class === 'imgextlink') {
                $image->outertext = '';
                continue;
            }

            // All remaining images become enclosures, except images without a usable
            // source and images embedded as data URIs.
            $src = $image->src;
            if (is_string($src) && $src !== '' && parse_url($src, PHP_URL_SCHEME) !== 'data') {
                $enclosures[] = $src;
            }
        }

        // The title can contain HTML entities. These can be converted back
        // to regular UTF-8 characters.
        $title = html_entity_decode($this->innerTextOf($div, '.c-title'), ENT_HTML5, 'UTF-8');
        $teaser = $this->innerTextOf($div, '.c-teaser');
        $content = $divContent->innertext;

        // If there's a teaser, make it more eye-catching,
        // so that it is clear, that this is not part of the actual content.
        if (trim($teaser) !== '') {
            $content = '<i><strong>' . $teaser . '</strong></i>' . $content;
        }

        $item = [
            'uri' => $url,
            'title' => $title,
            'content' => $content,
            'uid' => $id,
        ];

        if ($enclosures !== []) {
            $item['enclosures'] = $enclosures;
        }

        // Get the date of the article. Example: "zuletzt geändert: 26.05.2020"
        // The date is expected at the very end of the text; trailing whitespace is tolerated.
        $edit = $div->find('div#edit', 0);
        if ($edit && preg_match('/(\d{1,2}\.\d{1,2}\.\d{4})\s*$/', (string) $edit->plaintext, $match)) {
            // The leading "!" resets all fields that are not part of the format. Without it
            // the time of day would be taken from the current system time, so the timestamp
            // of an unchanged article would differ on every fetch.
            $editDate = DateTimeImmutable::createFromFormat('!d.m.Y', $match[1]);

            if ($editDate !== false) {
                $item['timestamp'] = $editDate->getTimestamp();
            }
        }

        return $item;
    }

    /**
     * Returns the inner HTML of the first element below $parent that matches $selector.
     *
     * @param object $parent   DOM node that offers find().
     * @param string $selector Selector of the wanted element.
     * @return string The inner HTML, or an empty string when no element matches.
     */
    private function innerTextOf($parent, string $selector): string
    {
        $element = $parent->find($selector, 0);

        return $element ? (string) $element->innertext : '';
    }
}
[SchweinfurtBuergerinformationenBridge] Add new bridge (#1610) 2022-03-25 00:37:44 +03:00			`<?php`
Reformat codebase v4 (#2872) Reformat code base to PSR12 Co-authored-by: rssbridge <noreply@github.com> 2022-07-01 16:10:30 +03:00
			`class SchweinfurtBuergerinformationenBridge extends BridgeAbstract`
			`{`
			`const MAINTAINER = 'mibe';`
			`const NAME = 'Schweinfurt Bürgerinformationen';`
			`const URI = 'https://www.schweinfurt.de/rathaus-politik/pressestelle/buergerinformationen/index.html';`
			`const ARTICLE_URI = 'https://www.schweinfurt.de/rathaus-politik/pressestelle/buergerinformationen/%d.html';`
			`const INDEX_CACHE_TIMEOUT = 10800; // 3h`
			`const ARTICLE_CACHE_TIMEOUT = 21600; // 6h`
			`const DESCRIPTION = 'Returns the latest news for citizens of Schweinfurt';`
			`const PARAMETERS = [`
			`[`
			`'pages' => [`
			`'name' => 'Number of pages',`
			`'type' => 'number',`
			`'title' => 'Specifies the number of pages to fetch. Usually one or two are enough.',`
			`'exampleValue' => '1',`
			`'defaultValue' => '1',`
			`]`
			`]`
			`];`

			`public function getIcon()`
			`{`
			`return 'https://www.schweinfurt.de/__/images/favicon.ico';`
			`}`

			`public function collectData()`
			`{`
			`// Get number of pages to retrieve. One page is the minimum.`
			`$pages = $this->getInput('pages');`
			`if (!is_int($pages) \|\| $pages < 1) {`
			`$pages = 1;`
			`}`

			`$articleIDs = [];`

			`for ($page = 0; $page < $pages; $page++) {`
			`$newIDs = $this->getArticleIDsFromPage($page);`
			`$articleIDs = array_merge($articleIDs, $newIDs);`
			`}`

			`foreach ($articleIDs as $articleID) {`
			`$this->items[] = $this->generateItemFromArticle($articleID);`
			`}`
			`}`

			`private function getArticleIDsFromPage($page)`
			`{`
			`$url = sprintf(self::URI . '?art_pager=%d', $page);`
			`$html = getSimpleHTMLDOMCached($url, self::INDEX_CACHE_TIMEOUT)`
			`or returnServerError('Could not retrieve ' . $url);`

			`$articles = $html->find('div.artikel-uebersicht');`
			`$articleIDs = [];`

			`foreach ($articles as $article) {`
			`// The article ID is in the 'id' attribute of the div element, prefixed with 'artikel_id_'`
			`if (preg_match('/artikel_id_(\d+)/', $article->id, $match)) {`
			`$articleIDs[] = $match[1];`
			`} else {`
			`returnServerError('Couldn\'t determine article ID from index page.');`
			`}`
			`}`

			`return $articleIDs;`
			`}`

			`private function generateItemFromArticle($id)`
			`{`
			`$url = sprintf(self::ARTICLE_URI, $id);`
			`$html = getSimpleHTMLDOMCached($url, self::ARTICLE_CACHE_TIMEOUT)`
			`or returnServerError('Could not retrieve ' . $url);`

			`$div = $html->find('div#artikel-detail', 0);`
			`$divContent = $div->find('.c-content', 0);`
			`$images = $divContent->find('img');`

			`// Every external link has a little arrow symbol image attached to it.`
			`// Remove this image. This has to be done before building $content.`
			`foreach ($images as $image) {`
			`if ($image->class == 'imgextlink') {`
			`$image->outertext = '';`
			`}`
			`}`

			`$title = $div->find('.c-title', 0)->innertext;`
			`$teaser = $div->find('.c-teaser', 0)->innertext;`
			`$content = $divContent->innertext;`

			`// The title can contain HTML entities. These can be converted back`
			`// to regular UTF-8 characters.`
			`$title = html_entity_decode($title, ENT_HTML5, 'UTF-8');`

			`// If there's a teaser, make it more eye-catching,`
			`// so that it is clear, that this is not part of the actual content.`
			`if (strlen(trim($teaser)) > 0) {`
			`$content = '<i><strong>' . $teaser . '</strong></i>' . $content;`
			`}`

			`$item = [`
			`'uri' => $url,`
			`'title' => $title,`
			`'content' => $content,`
			`'uid' => $id,`
			`];`

			`// Let's see if there are images in the content, and if yes, attach`
[SchweinfurtBuergerinformationenBridge] Don't include images with data URIs as enclosures. (#3811) See also setEnclosures() in FeedItem.php: URIs with a path are required. 2023-11-30 19:52:51 +03:00			`// them as enclosures, but not images which are used for linking to an external site and data URIs.`
Reformat codebase v4 (#2872) Reformat code base to PSR12 Co-authored-by: rssbridge <noreply@github.com> 2022-07-01 16:10:30 +03:00			`foreach ($images as $image) {`
[SchweinfurtBuergerinformationenBridge] Don't include images with data URIs as enclosures. (#3811) See also setEnclosures() in FeedItem.php: URIs with a path are required. 2023-11-30 19:52:51 +03:00			`if ($image->class != 'imgextlink' && parse_url($image->src, PHP_URL_SCHEME) != 'data') {`
Reformat codebase v4 (#2872) Reformat code base to PSR12 Co-authored-by: rssbridge <noreply@github.com> 2022-07-01 16:10:30 +03:00			`$item['enclosures'][] = $image->src;`
			`}`
			`}`

			`// Get the date of the article. Example: "zuletzt geändert: 26.05.2020"`
			`$editDate = $div->find('div#edit', 0)->plaintext;`
			`$editDate = substr($editDate, strrpos($editDate, ' ') + 1);`
			`$editDate = DateTime::createFromFormat('d.m.Y', $editDate);`

			`if ($editDate !== false) {`
			`$item['timestamp'] = $editDate->getTimestamp();`
			`}`

			`return $item;`
			`}`
[SchweinfurtBuergerinformationenBridge] Add new bridge (#1610) 2022-03-25 00:37:44 +03:00			`}`