mirror of
https://github.com/RSS-Bridge/rss-bridge.git
synced 2025-02-16 15:19:55 +03:00
[TheHackerNews] Update content extraction (#3458)
This commit is contained in:
parent
1b02d4f49b
commit
d4bc63ee98
1 changed file with 36 additions and 40 deletions
|
@ -10,6 +10,8 @@ class TheHackerNewsBridge extends BridgeAbstract
|
||||||
public function collectData()
|
public function collectData()
|
||||||
{
|
{
|
||||||
$html = getSimpleHTMLDOM($this->getURI());
|
$html = getSimpleHTMLDOM($this->getURI());
|
||||||
|
$html = convertLazyLoading($html);
|
||||||
|
$html = defaultLinkTo($html, $this->getURI());
|
||||||
$limit = 0;
|
$limit = 0;
|
||||||
|
|
||||||
foreach ($html->find('div.body-post') as $element) {
|
foreach ($html->find('div.body-post') as $element) {
|
||||||
|
@ -17,74 +19,68 @@ class TheHackerNewsBridge extends BridgeAbstract
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Author (not present on home page)
|
||||||
$article_author = null;
|
$article_author = null;
|
||||||
$icon_user = $element->find('i.icon-user', 0);
|
|
||||||
if ($icon_user) {
|
// Title
|
||||||
$article_author = trim($icon_user->parent()->plaintext);
|
|
||||||
$article_author = str_replace('', '', $article_author);
|
|
||||||
}
|
|
||||||
$article_title = $element->find('h2.home-title', 0)->plaintext;
|
$article_title = $element->find('h2.home-title', 0)->plaintext;
|
||||||
|
|
||||||
|
// Date
|
||||||
$article_timestamp = time();
|
$article_timestamp = time();
|
||||||
//Date without time
|
|
||||||
$calendar = $element->find('i.icon-calendar', 0);
|
$calendar = $element->find('i.icon-calendar', 0);
|
||||||
if ($calendar) {
|
if ($calendar) {
|
||||||
$article_timestamp = strtotime(
|
$article_timestamp = strtotime(
|
||||||
extractFromDelimiters(
|
extractFromDelimiters(
|
||||||
$calendar->parent()->outertext,
|
$calendar->parent()->outertext,
|
||||||
'</i>',
|
'</i>',
|
||||||
'<span>'
|
'</span>'
|
||||||
)
|
)
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
//Article thumbnail in lazy-loading image
|
// Thumbnail
|
||||||
if (is_object($element->find('img[data-echo]', 0))) {
|
$article_thumbnail = [];
|
||||||
$article_thumbnail = [
|
if (is_object($element->find('img', 0))) {
|
||||||
extractFromDelimiters(
|
$article_thumbnail = [ $element->find('img', 0)->src ];
|
||||||
$element->find('img[data-echo]', 0)->outertext,
|
|
||||||
"data-echo='",
|
|
||||||
"'"
|
|
||||||
)
|
|
||||||
];
|
|
||||||
} else {
|
|
||||||
$article_thumbnail = [];
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Content (truncated)
|
||||||
|
$article_content = $element->find('div.home-desc', 0)->plaintext;
|
||||||
|
|
||||||
|
// Now try expanding article
|
||||||
$article_url = $element->find('a.story-link', 0)->href;
|
$article_url = $element->find('a.story-link', 0)->href;
|
||||||
$article = getSimpleHTMLDOMCached($article_url);
|
$article_html = getSimpleHTMLDOMCached($article_url);
|
||||||
if ($article) {
|
if ($article_html) {
|
||||||
//Article body
|
// Content (expanded and cleaned)
|
||||||
$var = $article->find('div.articlebody', 0);
|
$article_body = $article_html->find('div.articlebody', 0);
|
||||||
if ($var) {
|
if ($article_body) {
|
||||||
$contents = $var->innertext;
|
$article_body = convertLazyLoading($article_body);
|
||||||
$contents = stripRecursiveHtmlSection($contents, 'div', '<div class="ad_');
|
$article_body = defaultLinkTo($article_body, $article_url);
|
||||||
$contents = stripWithDelimiters($contents, 'id="google_ads', '</iframe>');
|
$header_img = $article_body->find('img', 0);
|
||||||
$contents = stripWithDelimiters($contents, '<script', '</script>');
|
if ($header_img) {
|
||||||
|
$header_img->parent->style = '';
|
||||||
|
}
|
||||||
|
foreach ($article_body->find('center.cf') as $center_ad) {
|
||||||
|
$center_ad->outertext = '';
|
||||||
|
}
|
||||||
|
$article_content = $article_body->innertext;
|
||||||
}
|
}
|
||||||
//Date with time
|
// Author
|
||||||
if (is_object($article->find('meta[itemprop=dateModified]', 0))) {
|
$spans_author = $article_html->find('span.author');
|
||||||
$article_timestamp = strtotime(
|
if (count($spans_author) > 0) {
|
||||||
extractFromDelimiters(
|
$article_author = $spans_author[array_key_last($spans_author)]->plaintext;
|
||||||
$article->find('meta[itemprop=dateModified]', 0)->outertext,
|
|
||||||
"content='",
|
|
||||||
"'"
|
|
||||||
)
|
|
||||||
);
|
|
||||||
}
|
}
|
||||||
} else {
|
|
||||||
$contents = 'Could not request TheHackerNews: ' . $article_url;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
$item = [];
|
$item = [];
|
||||||
$item['uri'] = $article_url;
|
$item['uri'] = $article_url;
|
||||||
$item['title'] = $article_title;
|
$item['title'] = $article_title;
|
||||||
if ($article_author) {
|
if (!empty($article_author)) {
|
||||||
$item['author'] = $article_author;
|
$item['author'] = $article_author;
|
||||||
}
|
}
|
||||||
$item['enclosures'] = $article_thumbnail;
|
$item['enclosures'] = $article_thumbnail;
|
||||||
$item['timestamp'] = $article_timestamp;
|
$item['timestamp'] = $article_timestamp;
|
||||||
$item['content'] = trim($contents ?? '');
|
$item['content'] = trim($article_content);
|
||||||
$this->items[] = $item;
|
$this->items[] = $item;
|
||||||
$limit++;
|
$limit++;
|
||||||
}
|
}
|
||||||
|
|
Loading…
Add table
Reference in a new issue