[DarkReadingBridge] Fix content extraction (#2315)

Also:
- Add an article limit (the main feed was broken because it contained too many articles)
- Add support for article thumbnails
This commit is contained in:
ORelio 2021-10-29 22:21:07 +02:00 committed by GitHub
parent b86ed70376
commit 970bdd45f9
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23

View file

@ -48,22 +48,25 @@ class DarkReadingBridge extends FeedExpander {
if ($feed_id != '000') { if ($feed_id != '000') {
$feed_url .= '?f_n=' . $feed_id . '&f_ln=' . $feed_name; $feed_url .= '?f_n=' . $feed_id . '&f_ln=' . $feed_name;
} }
$this->collectExpandableDatas($feed_url); $this->collectExpandableDatas($feed_url, 20);
} }
protected function parseItem($newsItem){ protected function parseItem($newsItem){
$item = parent::parseItem($newsItem); $item = parent::parseItem($newsItem);
if (empty($item['content']))
return null; //ignore dummy articles
$article = getSimpleHTMLDOMCached($item['uri']) $article = getSimpleHTMLDOMCached($item['uri'])
or returnServerError('Could not request Dark Reading: ' . $item['uri']); or returnServerError('Could not request Dark Reading: ' . $item['uri']);
$item['content'] = $this->extractArticleContent($article); $item['content'] = $this->extractArticleContent($article);
$item['enclosures'] = array(); //remove author profile picture $item['enclosures'] = array(); //remove author profile picture
$image = $article->find('meta[property="og:image"]', 0);
if (is_object($image)) {
$image = $image->content;
$item['enclosures'] = array($image);
}
return $item; return $item;
} }
private function extractArticleContent($article){ private function extractArticleContent($article){
$content = $article->find('div#article-main', 0)->innertext; $content = $article->find('div.article-content', 0)->innertext;
foreach (array( foreach (array(
'<div class="divsplitter', '<div class="divsplitter',
@ -74,8 +77,6 @@ class DarkReadingBridge extends FeedExpander {
$content = stripRecursiveHTMLSection($content, 'div', $div_start); $content = stripRecursiveHTMLSection($content, 'div', $div_start);
} }
$content = stripWithDelimiters($content, '<h1 ', '</h1>');
return $content; return $content;
} }
} }