mirror of
https://github.com/RSS-Bridge/rss-bridge.git
synced 2025-03-14 20:21:14 +03:00
* [CssSelectorBridge] Improvements (#3537) * Improve parameter documentation / add tooltips * Allow extracting content from home page instead of article page * Keep titles from home page when every page <title> is the same * [CssSelectorBridge] Code linting * [CssSelectorBridge] Code linting (2) * [CssSelectorBridge] Code linting (3)
This commit is contained in:
parent
556bca58cf
commit
977c0db382
1 changed files with 80 additions and 23 deletions
|
@ -15,23 +15,40 @@ class CssSelectorBridge extends BridgeAbstract
|
|||
],
|
||||
'url_selector' => [
|
||||
'name' => 'Selector for article links or their parent elements',
|
||||
'title' => <<<EOT
|
||||
This bridge works using CSS selectors, e.g. "a.article" will match all <a class="article"
|
||||
href="URL">TITLE</a> on home page, each one being treated as a feed item.
|
||||
Instead of just a link you can select one of its parent elements. Everything inside that
|
||||
element becomes feed item content, e.g. image and summary present on home page.
|
||||
When doing so, the first link inside the selected element becomes feed item URL/Title.
|
||||
EOT,
|
||||
'exampleValue' => 'a.article',
|
||||
'required' => true
|
||||
],
|
||||
'url_pattern' => [
|
||||
'name' => '[Optional] Pattern for site URLs to keep in feed',
|
||||
'exampleValue' => 'https://example.com/article/.*',
|
||||
'title' => 'Optionally filter items by applying a regular expression on their URL',
|
||||
'exampleValue' => '/blog/article/.*',
|
||||
],
|
||||
'content_selector' => [
|
||||
'name' => '[Optional] Selector to extract each article content',
|
||||
'name' => '[Optional] Selector to expand each article content',
|
||||
'title' => <<<EOT
|
||||
When specified, the bridge will fetch each article from its URL
|
||||
and extract content using the provided selector (Slower!)
|
||||
EOT,
|
||||
'exampleValue' => 'article.content',
|
||||
],
|
||||
'content_cleanup' => [
|
||||
'name' => '[Optional] Content cleanup: List of items to remove',
|
||||
'title' => 'Selector for unnecessary elements to remove inside article contents.',
|
||||
'exampleValue' => 'div.ads, div.comments',
|
||||
],
|
||||
'title_cleanup' => [
|
||||
'name' => '[Optional] Text to remove from expanded article title',
|
||||
'title' => <<<EOT
|
||||
When fetching each article page, feed item title comes from page title.
|
||||
Specify here some text from page title that needs to be removed, e.g. " | BlogName".
|
||||
EOT,
|
||||
'exampleValue' => ' | BlogName',
|
||||
],
|
||||
'limit' => self::LIMIT
|
||||
|
@ -69,7 +86,7 @@ class CssSelectorBridge extends BridgeAbstract
|
|||
|
||||
$html = defaultLinkTo(getSimpleHTMLDOM($url), $url);
|
||||
$this->feedName = $this->getPageTitle($html, $title_cleanup);
|
||||
$items = $this->htmlFindLinks($html, $url_selector, $url_pattern, $limit);
|
||||
$items = $this->htmlFindEntries($html, $url_selector, $url_pattern, $limit, $content_cleanup);
|
||||
|
||||
if (empty($content_selector)) {
|
||||
$this->items = $items;
|
||||
|
@ -79,7 +96,8 @@ class CssSelectorBridge extends BridgeAbstract
|
|||
$item['uri'],
|
||||
$content_selector,
|
||||
$content_cleanup,
|
||||
$title_cleanup
|
||||
$title_cleanup,
|
||||
$item['title']
|
||||
);
|
||||
}
|
||||
}
|
||||
|
@ -127,30 +145,71 @@ class CssSelectorBridge extends BridgeAbstract
|
|||
}
|
||||
|
||||
/**
|
||||
* Retrieve first N links from webpage URL or DOM satisfying the specified criteria
|
||||
* @param string|object $page URL or DOM to retrieve links from
|
||||
* Remove all elements from HTML content matching cleanup selector
|
||||
* @param string|object $content HTML content as HTML object or string
|
||||
* @return string|object Cleaned content (same type as input)
|
||||
*/
|
||||
protected function cleanArticleContent($content, $cleanup_selector)
{
    // Strips every element matching $cleanup_selector out of $content.
    //   $content          HTML as a string or as an already-parsed DOM object
    //   $cleanup_selector CSS selector of elements to drop, e.g. "div.ads, div.comments"
    // Returns the cleaned content in the same type as it was passed in.

    // Nothing to remove: hand the input back untouched rather than paying
    // for a parse + re-serialize round trip on string content.
    if (empty($cleanup_selector)) {
        return $content;
    }

    $string_convert = is_string($content);
    $dom = $string_convert ? str_get_html($content) : $content;

    // str_get_html() can yield false instead of a DOM (e.g. for empty input);
    // calling find() on that would be a fatal error, so return the content as-is.
    if (!is_object($dom)) {
        return $content;
    }

    foreach ($dom->find($cleanup_selector) as $item_to_clean) {
        // Blanking outertext removes the element from the serialized output.
        $item_to_clean->outertext = '';
    }

    // Callers that passed a string get a string back.
    return $string_convert ? $dom->outertext : $dom;
}
|
||||
|
||||
/**
|
||||
* Retrieve first N link+title+truncated-content from webpage URL or DOM satisfying the specified criteria
|
||||
* @param string|object $page URL or DOM to retrieve feed items from
|
||||
* @param string $url_selector DOM selector for matching links or their parent element
|
||||
* @param string $url_pattern Optional filter to keep only links matching the pattern
|
||||
* @param int $limit Optional maximum amount of URLs to return
|
||||
* @return array of minimal feed items {'uri': entry_url, 'title', entry_title}
|
||||
* @param string $content_cleanup Optional selector for removing elements, e.g. "div.ads, div.comments"
|
||||
* @return array of items {'uri': entry_url, 'title': entry_title, ['content': when present in DOM] }
|
||||
*/
|
||||
protected function htmlFindLinks($page, $url_selector, $url_pattern = '', $limit = 0)
|
||||
protected function htmlFindEntries($page, $url_selector, $url_pattern = '', $limit = 0, $content_cleanup = null)
|
||||
{
|
||||
if (is_string($page)) {
|
||||
$page = getSimpleHTMLDOM($page);
|
||||
}
|
||||
|
||||
$links = $page->find($url_selector);
|
||||
|
||||
if (empty($links)) {
|
||||
returnClientError('No results for URL selector');
|
||||
}
|
||||
|
||||
$link_to_title = [];
|
||||
$link_to_item = [];
|
||||
foreach ($links as $link) {
|
||||
$item = [];
|
||||
if ($link->innertext != $link->plaintext) {
|
||||
$item['content'] = $link->innertext;
|
||||
}
|
||||
if ($link->tag != 'a') {
|
||||
$link = $link->find('a', 0);
|
||||
}
|
||||
$link_to_title[$link->href] = $link->plaintext;
|
||||
$item['uri'] = $link->href;
|
||||
$item['title'] = $link->plaintext;
|
||||
if (isset($item['content'])) {
|
||||
$item['content'] = convertLazyLoading($item['content']);
|
||||
$item['content'] = defaultLinkTo($item['content'], $item['uri']);
|
||||
$item['content'] = $this->cleanArticleContent($item['content'], $content_cleanup);
|
||||
}
|
||||
$link_to_item[$link->href] = $item;
|
||||
}
|
||||
|
||||
$links = $this->filterUrlList(array_keys($link_to_title), $url_pattern, $limit);
|
||||
$links = $this->filterUrlList(array_keys($link_to_item), $url_pattern, $limit);
|
||||
|
||||
if (empty($links)) {
|
||||
returnClientError('No results for URL pattern');
|
||||
|
@ -158,10 +217,7 @@ class CssSelectorBridge extends BridgeAbstract
|
|||
|
||||
$items = [];
|
||||
foreach ($links as $link) {
|
||||
$item = [];
|
||||
$item['uri'] = $link;
|
||||
$item['title'] = $link_to_title[$link];
|
||||
$items[] = $item;
|
||||
$items[] = $link_to_item[$link];
|
||||
}
|
||||
|
||||
return $items;
|
||||
|
@ -173,9 +229,10 @@ class CssSelectorBridge extends BridgeAbstract
|
|||
* @param string $content_selector HTML selector for extracting content, e.g. "article.content"
|
||||
* @param string $content_cleanup Optional selector for removing elements, e.g. "div.ads, div.comments"
|
||||
* @param string $title_cleanup Optional string to remove from article title, e.g. " | BlogName"
|
||||
* @param string $title_default Optional title to use when could not extract title reliably
|
||||
* @return array Entry data: uri, title, content
|
||||
*/
|
||||
protected function expandEntryWithSelector($entry_url, $content_selector, $content_cleanup = null, $title_cleanup = null)
|
||||
protected function expandEntryWithSelector($entry_url, $content_selector, $content_cleanup = null, $title_cleanup = null, $title_default = null)
|
||||
{
|
||||
if (empty($content_selector)) {
|
||||
returnClientError('Please specify a content selector');
|
||||
|
@ -190,18 +247,18 @@ class CssSelectorBridge extends BridgeAbstract
|
|||
returnClientError('Could not find content selector at URL: ' . $entry_url);
|
||||
}
|
||||
|
||||
if (!empty($content_cleanup)) {
|
||||
foreach ($article_content->find($content_cleanup) as $item_to_clean) {
|
||||
$item_to_clean->outertext = '';
|
||||
}
|
||||
}
|
||||
|
||||
$article_content = convertLazyLoading($article_content);
|
||||
$article_content = defaultLinkTo($article_content, $entry_url);
|
||||
$article_content = $this->cleanArticleContent($article_content, $content_cleanup);
|
||||
|
||||
$article_title = $this->getPageTitle($entry_html, $title_cleanup);
|
||||
if (!empty($title_default) && (empty($article_title) || $article_title === $this->feedName)) {
|
||||
$article_title = $title_default;
|
||||
}
|
||||
|
||||
$item = [];
|
||||
$item['uri'] = $entry_url;
|
||||
$item['title'] = $this->getPageTitle($entry_html, $title_cleanup);
|
||||
$item['title'] = $article_title;
|
||||
$item['content'] = $article_content;
|
||||
return $item;
|
||||
}
|
||||
|
|
Loading…
Add table
Reference in a new issue