[ 'name' => 'Site section', 'type' => 'list', 'defaultValue' => 'index', 'values' => [ 'All' => 'index', 'Apple' => 'apple', 'Board Games' => 'cardboard', 'Cars' => 'cars', 'Features' => 'features', 'Gaming' => 'gaming', 'Information Technology' => 'technology-lab', 'Science' => 'science', 'Staff Blogs' => 'staff-blogs', 'Tech Policy' => 'tech-policy', 'Tech' => 'gadgets', ] ] ]];

    /**
     * Collect the items of the selected site section.
     *
     * Appends the 'section' input to the Ars Technica feed base URL and hands
     * the result to collectExpandableDatas() with 10 as the second argument
     * (presumably the maximum number of items -- confirm against the parent class).
     */
    public function collectData()
    {
        $url = 'https://feeds.arstechnica.com/arstechnica/' . $this->getInput('section');
        $this->collectExpandableDatas($url, 10);
    }

    /**
     * Enrich a feed item with data scraped from the article page.
     *
     * Loads the page at $item['uri'], rebuilds the item content from the
     * article header and every '.post-content' element, strips interludes,
     * tables of contents, iframes and fixed aspect-ratio styling, and fills
     * 'categories' and 'uid' from the page's "parsely-page" metadata.
     *
     * Missing page parts are tolerated: without an article header only the
     * post content is used; without usable metadata the 'categories' and
     * 'uid' already present in $item are left alone; if the assembled markup
     * cannot be parsed the 'content' already present in $item is left alone.
     *
     * @param array $item Feed item; must contain the article URL under 'uri'.
     * @return array The item with 'content', 'categories' and 'uid' updated where possible.
     */
    protected function parseItem(array $item)
    {
        $item_html = getSimpleHTMLDOMCached($item['uri']);
        $item_html = defaultLinkTo($item_html, self::URI);

        $content = '';

        // The header is optional: calling find() on a missing header would be fatal.
        $header = $item_html->find('article header', 0);
        if ($header !== null) {
            $leading = $header->find('p[class*=leading]', 0);
            if ($leading !== null) {
                // NOTE(review): the literals around the lead paragraph contain only a
                // newline -- confirm that no wrapping markup is intended here.
                $content .= '
' . $leading->innertext . '
';
            }

            $intro_image = $header->find('img.intro-image', 0);
            if ($intro_image !== null) {
                // NOTE(review): this appends an empty string, so the intro image never
                // reaches the content -- confirm whether image markup is missing.
                $content .= '';
            }
        }

        foreach ($item_html->find('.post-content') as $content_tag) {
            $content .= $content_tag->innertext;
        }

        // Page metadata: only overwrite feed-supplied values when the page really
        // provides them, so a missing tag cannot blank out categories or the uid.
        $parsely = $item_html->find('[name="parsely-page"]', 0);
        $parsely_json = null;
        if ($parsely !== null) {
            $parsely_json = json_decode(html_entity_decode((string) $parsely->content), true);
        }
        if (is_array($parsely_json)) {
            if (isset($parsely_json['tags'])) {
                $item['categories'] = $parsely_json['tags'];
            }
            if (isset($parsely_json['post_id'])) {
                $item['uid'] = strval($parsely_json['post_id']);
            }
        }

        // str_get_html() does not return a DOM for empty or oversized input; in
        // that case keep whatever content the feed supplied instead of crashing.
        $content_dom = str_get_html($content);
        if (!is_object($content_dom)) {
            return $item;
        }
        $item['content'] = $content_dom;

        // Some lightboxes are nested in figures. I'd guess that's a
        // bug in the website
        foreach ($item['content']->find('figure div div.ars-lightbox') as $weird_lightbox) {
            $weird_lightbox->parent->parent->outertext = $weird_lightbox;
        }

        // It's easier to reconstruct the whole thing than remove
        // duplicate reactive tags
        foreach ($item['content']->find('.ars-lightbox') as $lightbox) {
            $lightbox_content = '';
            foreach ($lightbox->find('.ars-lightbox-item') as $lightbox_item) {
                $img = $lightbox_item->find('img', 0);
                if ($img !== null) {
                    // NOTE(review): appends an empty string, so every lightbox ends up
                    // empty -- confirm whether image markup is missing.
                    $lightbox_content .= '';
                }
            }
            $lightbox->innertext = $lightbox_content;
        }

        // remove various ars advertising
        foreach ($item['content']->find('.ars-interlude-container') as $ad) {
            $ad->remove();
        }
        foreach ($item['content']->find('.toc-container') as $toc) {
            $toc->remove();
        }

        // Mostly YouTube videos: each iframe is replaced by its source URL.
        $iframes = $item['content']->find('iframe');
        foreach ($iframes as $iframe) {
            $iframe->outertext = '' . $iframe->src . '';
        }

        // This fixed padding around the former iframes and actual inline videos
        foreach ($item['content']->find('div[style*=aspect-ratio]') as $styled) {
            $styled->removeAttribute('style');
        }

        $item['content'] = backgroundToImg($item['content']);

        return $item;
    }
}