[FolhaDeSaoPauloBridge]: Small improvements (#1724)

This commit is contained in:
somini 2022-03-24 22:16:02 +00:00 committed by GitHub
parent 5c69577253
commit cb4bc57c72
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23

View file

@ -12,13 +12,25 @@ class FolhaDeSaoPauloBridge extends FeedExpander {
'required' => true,
'title' => 'Select the sub-feed (see https://www1.folha.uol.com.br/feed/)',
'exampleValue' => 'emcimadahora/rss091.xml',
)
),
'amount' => array(
'name' => 'Amount of items to fetch',
'type' => 'number',
'defaultValue' => 15,
),
'deep_crawl' => array(
'name' => 'Deep Crawl',
'description' => 'Crawl each item "deeply", that is, return the article contents',
'type' => 'checkbox',
'defaultValue' => true,
),
)
);
protected function parseItem($item){
$item = parent::parseItem($item);
if ($this->getInput('deep_crawl')) {
$articleHTMLContent = getSimpleHTMLDOMCached($item['uri']);
if($articleHTMLContent) {
foreach ($articleHTMLContent->find('div.c-news__body .is-hidden') as $toRemove) {
@ -27,13 +39,17 @@ class FolhaDeSaoPauloBridge extends FeedExpander {
$item_content = $articleHTMLContent->find('div.c-news__body', 0);
if ($item_content) {
$text = $item_content->innertext;
$text = strip_tags($text, '<p><b><a><blockquote><figure><figcaption><img><strong><em>');
$text = strip_tags($text, '<p><b><a><blockquote><figure><figcaption><img><strong><em><ul><li>');
$item['content'] = $text;
$item['uri'] = explode('*', $item['uri'])[1];
}
} else {
Debug::log('???: ' . $item['uri']);
}
}
else {
$item['uri'] = explode('*', $item['uri'])[1];
}
return $item;
}
@ -48,6 +64,6 @@ class FolhaDeSaoPauloBridge extends FeedExpander {
$feed_url = self::URI . '/' . $this->getInput('feed');
}
Debug::log('URL: ' . $feed_url);
$this->collectExpandableDatas($feed_url);
$this->collectExpandableDatas($feed_url, $this->getInput('amount'));
}
}