diff --git a/bridges/TheHackerNewsBridge.php b/bridges/TheHackerNewsBridge.php index 0636bb46..1f9c34c0 100644 --- a/bridges/TheHackerNewsBridge.php +++ b/bridges/TheHackerNewsBridge.php @@ -10,6 +10,8 @@ class TheHackerNewsBridge extends BridgeAbstract public function collectData() { $html = getSimpleHTMLDOM($this->getURI()); + $html = convertLazyLoading($html); + $html = defaultLinkTo($html, $this->getURI()); $limit = 0; foreach ($html->find('div.body-post') as $element) { @@ -17,74 +19,68 @@ class TheHackerNewsBridge extends BridgeAbstract break; } + // Author (not present on home page) $article_author = null; - $icon_user = $element->find('i.icon-user', 0); - if ($icon_user) { - $article_author = trim($icon_user->parent()->plaintext); - $article_author = str_replace('', '', $article_author); - } + + // Title $article_title = $element->find('h2.home-title', 0)->plaintext; + // Date $article_timestamp = time(); - //Date without time $calendar = $element->find('i.icon-calendar', 0); if ($calendar) { $article_timestamp = strtotime( extractFromDelimiters( $calendar->parent()->outertext, '', - '' + '' ) ); } - //Article thumbnail in lazy-loading image - if (is_object($element->find('img[data-echo]', 0))) { - $article_thumbnail = [ - extractFromDelimiters( - $element->find('img[data-echo]', 0)->outertext, - "data-echo='", - "'" - ) - ]; - } else { - $article_thumbnail = []; + // Thumbnail + $article_thumbnail = []; + if (is_object($element->find('img', 0))) { + $article_thumbnail = [ $element->find('img', 0)->src ]; } + // Content (truncated) + $article_content = $element->find('div.home-desc', 0)->plaintext; + + // Now try expanding article $article_url = $element->find('a.story-link', 0)->href; - $article = getSimpleHTMLDOMCached($article_url); - if ($article) { - //Article body - $var = $article->find('div.articlebody', 0); - if ($var) { - $contents = $var->innertext; - $contents = stripRecursiveHtmlSection($contents, 'div', '
'); - $contents = stripWithDelimiters($contents, ''); + $article_html = getSimpleHTMLDOMCached($article_url); + if ($article_html) { + // Content (expanded and cleaned) + $article_body = $article_html->find('div.articlebody', 0); + if ($article_body) { + $article_body = convertLazyLoading($article_body); + $article_body = defaultLinkTo($article_body, $article_url); + $header_img = $article_body->find('img', 0); + if ($header_img) { + $header_img->parent->style = ''; + } + foreach ($article_body->find('center.cf') as $center_ad) { + $center_ad->outertext = ''; + } + $article_content = $article_body->innertext; } - //Date with time - if (is_object($article->find('meta[itemprop=dateModified]', 0))) { - $article_timestamp = strtotime( - extractFromDelimiters( - $article->find('meta[itemprop=dateModified]', 0)->outertext, - "content='", - "'" - ) - ); + // Author + $spans_author = $article_html->find('span.author'); + if (count($spans_author) > 0) { + $article_author = $spans_author[array_key_last($spans_author)]->plaintext; } - } else { - $contents = 'Could not request TheHackerNews: ' . $article_url; } $item = []; $item['uri'] = $article_url; $item['title'] = $article_title; - if ($article_author) { + if (!empty($article_author)) { $item['author'] = $article_author; } $item['enclosures'] = $article_thumbnail; $item['timestamp'] = $article_timestamp; - $item['content'] = trim($contents ?? ''); + $item['content'] = trim($article_content); $this->items[] = $item; $limit++; }