[
                'name' => 'Number of pages',
                'type' => 'number',
                'title' => 'Specifies the number of pages to fetch. Usually one or two are enough.',
                'exampleValue' => '1',
                'defaultValue' => '1',
            ]
        ]
    ];

    /**
     * Returns the icon URL: the favicon of www.schweinfurt.de.
     *
     * @return string
     */
    public function getIcon()
    {
        return 'https://www.schweinfurt.de/__/images/favicon.ico';
    }

    /**
     * Reads the article IDs from the requested number of index pages and
     * appends one item per article to $this->items.
     *
     * The 'pages' input is used when it is a whole number >= 1 (an int or a
     * numeric string); any other value falls back to a single page.
     */
    public function collectData()
    {
        // Get number of pages to retrieve. One page is the minimum.
        // filter_var() returns the validated int, or false for anything that
        // is not a whole number within the allowed range.
        $pages = filter_var(
            $this->getInput('pages'),
            FILTER_VALIDATE_INT,
            ['options' => ['min_range' => 1]]
        );
        if ($pages === false) {
            $pages = 1;
        }

        // Keyed by article ID so that an article which shows up on two index
        // pages produces a single item. Insertion order is preserved.
        $articleIDs = [];
        for ($page = 0; $page < $pages; $page++) {
            foreach ($this->getArticleIDsFromPage($page) as $articleID) {
                $articleIDs[$articleID] = $articleID;
            }
        }

        foreach ($articleIDs as $articleID) {
            $this->items[] = $this->generateItemFromArticle($articleID);
        }
    }

    /**
     * Fetches one index page and extracts the IDs of the articles listed on it.
     *
     * @param int $page Value sent as the 'art_pager' query parameter.
     * @return string[] Article IDs (digit strings) in page order.
     */
    private function getArticleIDsFromPage($page)
    {
        // self::URI is passed as an argument rather than being part of the
        // format string, so a literal '%' in it cannot be misread as a
        // conversion specification.
        $url = sprintf('%s?art_pager=%d', self::URI, $page);
        $html = getSimpleHTMLDOMCached($url, self::INDEX_CACHE_TIMEOUT);

        $articleIDs = [];
        foreach ($html->find('div.artikel-uebersicht') as $article) {
            // The article ID is in the 'id' attribute of the div element, prefixed with 'artikel_id_'
            if (preg_match('/artikel_id_(\d+)/', (string) $article->id, $match)) {
                $articleIDs[] = $match[1];
            } else {
                returnServerError('Couldn\'t determine article ID from index page.');
            }
        }

        return $articleIDs;
    }

    /**
     * Fetches a single article page and converts it into a feed item.
     *
     * @param string|int $id Article ID, inserted into self::ARTICLE_URI.
     * @return array Item with the keys 'uri', 'title', 'content' and 'uid',
     *               plus 'enclosures' when the content contains usable images
     *               and 'timestamp' when an edit date could be parsed.
     */
    private function generateItemFromArticle($id)
    {
        $url = sprintf(self::ARTICLE_URI, $id);
        $html = getSimpleHTMLDOMCached($url, self::ARTICLE_CACHE_TIMEOUT);

        $div = $html->find('div#artikel-detail', 0);
        $divContent = $div !== null ? $div->find('.c-content', 0) : null;
        $titleElement = $div !== null ? $div->find('.c-title', 0) : null;

        if ($divContent === null || $titleElement === null) {
            // Without these elements no item can be built. Report it the same
            // way an unparsable index page is reported.
            // NOTE(review): this relies on returnServerError() aborting
            // execution (it is defined outside this chunk) - confirm.
            returnServerError('Couldn\'t parse article page for article ID ' . $id . '.');
        }

        $images = $divContent->find('img');

        // Every external link has a little arrow symbol image attached to it.
        // Remove this image. This has to be done before building $content.
        foreach ($images as $image) {
            if ($image->class === 'imgextlink') {
                $image->outertext = '';
            }
        }

        // The teaser is optional; a missing element is treated like an empty one.
        $teaserElement = $div->find('.c-teaser', 0);
        $teaser = $teaserElement !== null ? $teaserElement->innertext : '';
        $content = $divContent->innertext;

        // The title can contain HTML entities. These can be converted back
        // to regular UTF-8 characters. ENT_QUOTES is required in addition to
        // ENT_HTML5, otherwise quote entities such as &quot; are left encoded.
        $title = html_entity_decode($titleElement->innertext, ENT_QUOTES | ENT_HTML5, 'UTF-8');

        // If there's a teaser, put it in front of the actual content.
        // NOTE(review): the wrapper strings around the teaser are empty, so no
        // emphasis markup is added - confirm whether markup was lost here.
        if (trim($teaser) !== '') {
            $content = '' . $teaser . '' . $content;
        }

        $item = [
            'uri' => $url,
            'title' => $title,
            'content' => $content,
            'uid' => $id,
        ];

        // Let's see if there are images in the content, and if yes, attach
        // them as enclosures, but not images which are used for linking to an
        // external site, images without a source, and data URIs.
        foreach ($images as $image) {
            $src = $image->src;
            if ($image->class === 'imgextlink' || !is_string($src) || $src === '') {
                continue;
            }
            if (strtolower((string) parse_url($src, PHP_URL_SCHEME)) === 'data') {
                continue;
            }
            $item['enclosures'][] = $src;
        }

        // Get the date of the article. Example: "zuletzt geändert: 26.05.2020"
        // The last date found in the text is used; the element itself is optional.
        $editElement = $div->find('div#edit', 0);
        if (
            $editElement !== null
            && preg_match_all('/\b\d{1,2}\.\d{1,2}\.\d{4}\b/', (string) $editElement->plaintext, $dates) > 0
        ) {
            // The leading '!' resets the fields that are not part of the
            // format. Without it the time of day would be the current time,
            // and the timestamp would differ on every run.
            $editDate = DateTime::createFromFormat('!d.m.Y', end($dates[0]));
            if ($editDate !== false) {
                $item['timestamp'] = $editDate->getTimestamp();
            }
        }

        return $item;
    }
}