[TheHackerNews] Update content extraction (#3458)

This commit is contained in:
ORelio 2023-06-25 19:01:57 +02:00 committed by GitHub
parent 1b02d4f49b
commit d4bc63ee98
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23

View file

@ -10,6 +10,8 @@ class TheHackerNewsBridge extends BridgeAbstract
public function collectData() public function collectData()
{ {
$html = getSimpleHTMLDOM($this->getURI()); $html = getSimpleHTMLDOM($this->getURI());
$html = convertLazyLoading($html);
$html = defaultLinkTo($html, $this->getURI());
$limit = 0; $limit = 0;
foreach ($html->find('div.body-post') as $element) { foreach ($html->find('div.body-post') as $element) {
@ -17,74 +19,68 @@ class TheHackerNewsBridge extends BridgeAbstract
break; break;
} }
// Author (not present on home page)
$article_author = null; $article_author = null;
$icon_user = $element->find('i.icon-user', 0);
if ($icon_user) { // Title
$article_author = trim($icon_user->parent()->plaintext);
$article_author = str_replace('', '', $article_author);
}
$article_title = $element->find('h2.home-title', 0)->plaintext; $article_title = $element->find('h2.home-title', 0)->plaintext;
// Date
$article_timestamp = time(); $article_timestamp = time();
//Date without time
$calendar = $element->find('i.icon-calendar', 0); $calendar = $element->find('i.icon-calendar', 0);
if ($calendar) { if ($calendar) {
$article_timestamp = strtotime( $article_timestamp = strtotime(
extractFromDelimiters( extractFromDelimiters(
$calendar->parent()->outertext, $calendar->parent()->outertext,
'</i>', '</i>',
'<span>' '</span>'
) )
); );
} }
//Article thumbnail in lazy-loading image // Thumbnail
if (is_object($element->find('img[data-echo]', 0))) { $article_thumbnail = [];
$article_thumbnail = [ if (is_object($element->find('img', 0))) {
extractFromDelimiters( $article_thumbnail = [ $element->find('img', 0)->src ];
$element->find('img[data-echo]', 0)->outertext,
"data-echo='",
"'"
)
];
} else {
$article_thumbnail = [];
} }
// Content (truncated)
$article_content = $element->find('div.home-desc', 0)->plaintext;
// Now try expanding article
$article_url = $element->find('a.story-link', 0)->href; $article_url = $element->find('a.story-link', 0)->href;
$article = getSimpleHTMLDOMCached($article_url); $article_html = getSimpleHTMLDOMCached($article_url);
if ($article) { if ($article_html) {
//Article body // Content (expanded and cleaned)
$var = $article->find('div.articlebody', 0); $article_body = $article_html->find('div.articlebody', 0);
if ($var) { if ($article_body) {
$contents = $var->innertext; $article_body = convertLazyLoading($article_body);
$contents = stripRecursiveHtmlSection($contents, 'div', '<div class="ad_'); $article_body = defaultLinkTo($article_body, $article_url);
$contents = stripWithDelimiters($contents, 'id="google_ads', '</iframe>'); $header_img = $article_body->find('img', 0);
$contents = stripWithDelimiters($contents, '<script', '</script>'); if ($header_img) {
$header_img->parent->style = '';
}
foreach ($article_body->find('center.cf') as $center_ad) {
$center_ad->outertext = '';
}
$article_content = $article_body->innertext;
} }
//Date with time // Author
if (is_object($article->find('meta[itemprop=dateModified]', 0))) { $spans_author = $article_html->find('span.author');
$article_timestamp = strtotime( if (count($spans_author) > 0) {
extractFromDelimiters( $article_author = $spans_author[array_key_last($spans_author)]->plaintext;
$article->find('meta[itemprop=dateModified]', 0)->outertext,
"content='",
"'"
)
);
} }
} else {
$contents = 'Could not request TheHackerNews: ' . $article_url;
} }
$item = []; $item = [];
$item['uri'] = $article_url; $item['uri'] = $article_url;
$item['title'] = $article_title; $item['title'] = $article_title;
if ($article_author) { if (!empty($article_author)) {
$item['author'] = $article_author; $item['author'] = $article_author;
} }
$item['enclosures'] = $article_thumbnail; $item['enclosures'] = $article_thumbnail;
$item['timestamp'] = $article_timestamp; $item['timestamp'] = $article_timestamp;
$item['content'] = trim($contents ?? ''); $item['content'] = trim($article_content);
$this->items[] = $item; $this->items[] = $item;
$limit++; $limit++;
} }