[ 'name' => 'Category', 'type' => 'list', 'values' => [ 'Alle News' => 'https://rss.golem.de/rss.php?feed=ATOM1.0', 'Audio/Video' => 'https://rss.golem.de/rss.php?ms=audio-video&feed=ATOM1.0', 'Auto' => 'https://rss.golem.de/rss.php?ms=auto&feed=ATOM1.0', 'Foto' => 'https://rss.golem.de/rss.php?ms=foto&feed=ATOM1.0', 'Games' => 'https://rss.golem.de/rss.php?ms=games&feed=ATOM1.0', 'Handy' => 'https://rss.golem.de/rss.php?ms=handy&feed=ATOM1.0', 'Internet' => 'https://rss.golem.de/rss.php?ms=internet&feed=ATOM1.0', 'Mobil' => 'https://rss.golem.de/rss.php?ms=mobil&feed=ATOM1.0', 'Open Source' => 'https://rss.golem.de/rss.php?ms=open-source&feed=ATOM1.0', 'Politik/Recht' => 'https://rss.golem.de/rss.php?ms=politik-recht&feed=ATOM1.0', 'Security' => 'https://rss.golem.de/rss.php?ms=security&feed=ATOM1.0', 'Desktop-Applikationen' => 'https://rss.golem.de/rss.php?ms=desktop-applikationen&feed=ATOM1.0', 'Software-Entwicklung' => 'https://rss.golem.de/rss.php?ms=softwareentwicklung&feed=ATOM1.0', 'Wirtschaft' => 'https://rss.golem.de/rss.php?ms=wirtschaft&feed=ATOM1.0', 'Wissenschaft' => 'https://rss.golem.de/rss.php?ms=wissenschaft&feed=ATOM1.0' ] ], 'limit' => [ 'name' => 'Limit', 'type' => 'number', 'required' => false, 'title' => 'Specify number of full articles to return', 'defaultValue' => 5 ] ]]; const LIMIT = 5; const HEADERS = ['Cookie: golem_consent20=simple|220101;']; public function collectData() { $this->collectExpandableDatas( $this->getInput('category'), $this->getInput('limit') ?: static::LIMIT ); } protected function parseItem(array $item) { $item['content'] ??= ''; $uri = $item['uri']; $urls = []; while ($uri) { if (isset($urls[$uri])) { // Prevent forever a loop break; } $urls[$uri] = true; $articlePage = getSimpleHTMLDOMCached($uri, static::CACHE_TIMEOUT, static::HEADERS); // URI without RSS feed reference $item['uri'] = $articlePage->find('head meta[name="twitter:url"]', 0)->content; $categories = $articlePage->find('ul.tags__list li'); foreach ($categories as $category) { $trimmedcategories[] = trim(html_entity_decode($category->plaintext)); } if (isset($trimmedcategories)) { $item['categories'] = array_unique($trimmedcategories); } $item['content'] .= $this->extractContent($articlePage); // next page $nextUri = $articlePage->find('link[rel="next"]', 0); $uri = $nextUri ? static::URI . $nextUri->href : null; } return $item; } private function extractContent($page) { $item = ''; $article = $page->find('article', 0); //built youtube iframes foreach ($article->find('.embedcontent') as &$embedcontent) { $ytscript = $embedcontent->find('script', 0); if (preg_match('/(www.youtube.com.*?)\"/', $ytscript->innertext, $link)) { $link = 'https://' . str_replace('\\', '', $link[1]); $embedcontent->innertext .= <<'; EOT; } } //built golem videos foreach ($article->find('.gvideofig') as &$embedcontent) { if (preg_match('/gvideo_(.*)/', $embedcontent->id, $videoid)) { $embedcontent->innertext .= << EOT; } } // delete known bad elements foreach ( $article->find('div[id*="adtile"], #job-market, #seminars, iframe, div.gbox_affiliate, div.toc') as $bad ) { $bad->remove(); } // reload html, as remove() is buggy $article = str_get_html($article->outertext); $header = $article->find('header', 0); foreach ($header->find('p, figure') as $element) { $item .= $element; } $content = $article->find('div.formatted', 0); // full image quality foreach ($content->find('img[data-src-full][src*="."]') as $img) { $img->src = $img->getAttribute('data-src-full'); } foreach ($content->find('p, h1, h2, h3, img[src*="."], iframe, video') as $element) { $item .= $element; } return $item; } }