2020-02-04 19:19:39 +03:00
|
|
|
<?php
|
|
|
|
|
|
|
|
/**
 * Bridge for the Folha de São Paulo feeds.
 *
 * Expands one of the site's feeds and, when the "deep_crawl" option is
 * enabled, replaces each item's content with the body of the article page.
 */
class FolhaDeSaoPauloBridge extends FeedExpander
{
    const MAINTAINER = 'somini';
    const NAME = 'Folha de São Paulo';
    const URI = 'https://www1.folha.uol.com.br';
    const DESCRIPTION = 'Returns the newest posts from Folha de São Paulo (full text)';
    const PARAMETERS = [
        [
            'feed' => [
                'name' => 'Feed sub-URL',
                'type' => 'text',
                'required' => true,
                'title' => 'Select the sub-feed (see https://www1.folha.uol.com.br/feed/)',
                'exampleValue' => 'emcimadahora/rss091.xml',
            ],
            'amount' => [
                'name' => 'Amount of items to fetch',
                'type' => 'number',
                'defaultValue' => 15,
            ],
            'deep_crawl' => [
                'name' => 'Deep Crawl',
                'description' => 'Crawl each item "deeply", that is, return the article contents',
                'type' => 'checkbox',
                'defaultValue' => true,
            ],
        ]
    ];

    /**
     * Tags kept in the article body when deep crawling; everything else is
     * stripped by strip_tags().
     */
    const ALLOWED_CONTENT_TAGS = '<p><b><a><blockquote><figure><figcaption><img><strong><em><ul><li>';

    /**
     * Parse a single feed item and optionally replace its content with the
     * full article body.
     *
     * The item URI is always reduced to the part following the first `*`
     * (when there is one), regardless of whether the deep crawl succeeds.
     *
     * @param mixed $item Raw feed item, passed through to the parent parser.
     * @return array The parsed item, with `uri` cleaned up and, on a
     *               successful deep crawl, `content` set to the article body.
     */
    protected function parseItem($item)
    {
        $item = parent::parseItem($item);

        if (empty($item['uri'])) {
            // Nothing to clean up or crawl without a URI.
            return $item;
        }

        // The page is fetched with the URI exactly as the feed delivered it;
        // only the URI exposed on the item is rewritten.
        $feedUri = $item['uri'];
        $item['uri'] = $this->stripRedirectPrefix($feedUri);

        if (!$this->getInput('deep_crawl')) {
            return $item;
        }

        $articleHTMLContent = getSimpleHTMLDOMCached($feedUri);
        if (!$articleHTMLContent) {
            Debug::log('???: ' . $feedUri);
            return $item;
        }

        // Blank out elements flagged as hidden inside the article body so
        // they do not leak into the extracted text.
        foreach ($articleHTMLContent->find('div.c-news__body .is-hidden') as $hiddenNode) {
            $hiddenNode->innertext = '';
        }

        $articleBody = $articleHTMLContent->find('div.c-news__body', 0);
        if ($articleBody) {
            $item['content'] = strip_tags($articleBody->innertext, self::ALLOWED_CONTENT_TAGS);
        }

        return $item;
    }

    /**
     * Build the feed URL from the user input and collect its items.
     *
     * The "feed" input may be either a full URL under self::URI or a path
     * relative to it, with or without a leading slash.
     *
     * @return void
     */
    public function collectData()
    {
        $feedInput = trim((string) $this->getInput('feed'));

        if (substr($feedInput, 0, strlen(self::URI)) === self::URI) {
            Debug::log('Input:: ' . $feedInput);
            $feedUrl = $feedInput;
        } else {
            // Drop any leading slash from the input so that exactly one
            // separator ends up between the base URI and the feed path.
            $feedUrl = self::URI . '/' . ltrim($feedInput, '/');
        }
        Debug::log('URL: ' . $feedUrl);

        $limit = $this->getInput('amount');
        $this->collectExpandableDatas($feedUrl, $limit);
    }

    /**
     * Return the part of a URI that follows its first `*`.
     *
     * NOTE(review): presumably the feed wraps article links as
     * `<redirector>*<article URL>` — confirm against a live feed.
     *
     * @param string $uri URI as delivered by the feed.
     * @return string Everything after the first `*`, or the URI unchanged
     *                when it has no `*` or nothing follows it.
     */
    private function stripRedirectPrefix($uri)
    {
        // Limit of 2 keeps any further `*` characters inside the target URL.
        $parts = explode('*', (string) $uri, 2);

        return (isset($parts[1]) && $parts[1] !== '') ? $parts[1] : $uri;
    }
}
|