[ 'name' => 'Site URL: Page with latest articles', 'exampleValue' => 'https://example.com/blog/', 'required' => true ], 'cookie' => [ 'name' => '[Optional] Cookie', 'title' => << 'sessionId=deadb33f' ], 'title_cleanup' => [ 'name' => '[Optional] Text to remove from feed title', 'title' => << ' | BlogName', ], 'entry_element_selector' => [ 'name' => 'Selector for article entry elements', 'title' => <<... on home page, each one being treated as a feed item. Use the URL selector option to select the `a` element with the `href` to the article link. If this option is not configured, the first encountered `a` element is used. EOT, 'exampleValue' => 'div.article', 'required' => true ], 'url_selector' => [ 'name' => '[Optional] Selector for link elements', 'title' => << 'a.article', 'defaultValue' => 'a' ], 'url_pattern' => [ 'name' => '[Optional] Pattern for site URLs to keep in feed', 'title' => 'Optionally filter items by applying a regular expression on their URL', 'exampleValue' => '/blog/article/.*', ], 'limit' => self::LIMIT, 'use_article_pages' => [ 'name' => 'Load article from page', 'title' => << 'checkbox' ], 'article_page_content_selector' => [ 'name' => '[Optional] Selector to select article element', 'title' => 'Extract the article from its page using the provided selector', 'exampleValue' => 'article.content', ], 'content_cleanup' => [ 'name' => '[Optional] Content cleanup: selector for items to remove', 'title' => 'Selector for unnecessary elements to remove inside article contents.', 'exampleValue' => 'div.ads, div.comments', ], 'title_selector' => [ 'name' => '[Optional] Selector for the article title', 'title' => 'Selector to select the article title', 'defaultValue' => 'h1' ], 'category_selector' => [ 'name' => '[Optional] Categories', 'title' => << 'span.category, #main-category' ], 'author_selector' => [ 'name' => '[Optional] Author', 'title' => << 'span#author' ], 'time_selector' => [ 'name' => '[Optional] Time selector', 'title' => << [ 'name' => 
'[Optional] Format string for parsing time', 'title' => << [ 'name' => '[Optional] Remove styling', 'title' => 'Remove class and style attributes from the page elements', 'type' => 'checkbox' ] ] ];

    /** @var string Feed name; set by collectData() from the home page title */
    private $feedName = '';

    /**
     * Return the configured `home_page` input, or the parent's URI when that input is empty.
     *
     * @return string
     */
    public function getURI()
    {
        $url = $this->getInput('home_page');
        if (empty($url)) {
            $url = parent::getURI();
        }

        return $url;
    }

    /**
     * Return the feed name extracted by collectData(), or the parent's name when none was extracted.
     *
     * @return string
     */
    public function getName()
    {
        if (!empty($this->feedName)) {
            return $this->feedName;
        }

        return parent::getName();
    }

    /**
     * Build the list of HTTP headers used to fetch the home page.
     *
     * @return array List of raw header lines; contains a `Cookie:` header when the `cookie` input is set
     */
    protected function getHeaders()
    {
        $headers = [];
        $cookie = $this->getInput('cookie');
        if (!empty($cookie)) {
            $headers[] = 'Cookie: ' . $cookie;
        }

        return $headers;
    }

    /**
     * Fetch the home page, locate the entry elements and turn each of them into a feed item.
     *
     * When `use_article_pages` is enabled, every entry element is replaced by the article
     * element fetched from the entry's own URL before being parsed.
     */
    public function collectData()
    {
        $url = $this->getInput('home_page');
        $headers = $this->getHeaders();

        $entry_element_selector = $this->getInput('entry_element_selector');
        $url_selector = $this->getInput('url_selector');
        $url_pattern = $this->getInput('url_pattern');
        $limit = $this->getInput('limit') ?? 10;
        $use_article_pages = $this->getInput('use_article_pages');
        $article_page_content_selector = $this->getInput('article_page_content_selector');
        $content_cleanup = $this->getInput('content_cleanup');
        $title_selector = $this->getInput('title_selector');
        $title_cleanup = $this->getInput('title_cleanup');
        $time_selector = $this->getInput('time_selector');
        $time_format = $this->getInput('time_format');
        $category_selector = $this->getInput('category_selector');
        $author_selector = $this->getInput('author_selector');
        $remove_styling = $this->getInput('remove_styling');

        $html = defaultLinkTo(getSimpleHTMLDOM($url, $headers), $url);
        $this->feedName = $this->getTitle($html, $title_cleanup);

        $entry_elements = $this->htmlFindEntryElements(
            $html,
            $entry_element_selector,
            $url_selector,
            $url_pattern,
            $limit
        );
        if (empty($entry_elements)) {
            return;
        }

        // Fetch the elements from the article pages.
        if ($use_article_pages) {
            if (empty($article_page_content_selector)) {
                returnClientError('`Article selector` is required when `Load article page` is enabled');
            }

            foreach (array_keys($entry_elements) as $uri) {
                $entry_elements[$uri] = $this->fetchArticleElementFromPage($uri, $article_page_content_selector);
            }
        }

        foreach ($entry_elements as $uri => $element) {
            $entry = $this->parseEntryElement(
                $element,
                $title_selector,
                $author_selector,
                $category_selector,
                $time_selector,
                $time_format,
                $content_cleanup,
                $this->feedName,
                $remove_styling
            );
            $entry['uri'] = $uri;
            $this->items[] = $entry;
        }
    }

    /**
     * Filter a list of URLs using a pattern and limit
     *
     * @param array $links List of URLs
     * @param string $url_pattern Regular expression body (without delimiters) to look for in URLs;
     *                            an empty pattern keeps every URL
     * @param int $limit Optional maximum amount of URLs to return (0 = no limit)
     * @return array Array of URLs
     */
    protected function filterUrlList($links, $url_pattern, $limit = 0)
    {
        if (!empty($url_pattern)) {
            // Wrap the pattern in `/` delimiters, escaping any slash it already contains.
            $url_pattern = '/' . str_replace('/', '\/', $url_pattern) . '/';

            // The pattern must be captured explicitly: PHP closures do not see the
            // enclosing scope's variables unless they are listed in `use`.
            $links = array_filter($links, function ($url) use ($url_pattern) {
                return preg_match($url_pattern, $url) === 1;
            });
        }

        if ($limit > 0 && count($links) > $limit) {
            $links = array_slice($links, 0, $limit);
        }

        return $links;
    }

    /**
     * Retrieve title from webpage URL or DOM
     *
     * @param string|object $page URL or DOM to retrieve title from
     * @param string $title_cleanup optional string to remove from webpage title, e.g. " | BlogName"
     * @return string Webpage title, or an empty string when the page has no `title` element
     */
    protected function getTitle($page, $title_cleanup)
    {
        if (is_string($page)) {
            $page = getSimpleHTMLDOMCached($page);
        }

        $title_element = $page->find('title', 0);
        if (is_null($title_element)) {
            // No <title> in the document: report "no title" instead of dereferencing null.
            return '';
        }

        $title = html_entity_decode($title_element->plaintext);
        if (!empty($title)) {
            if (!is_null($title_cleanup) && $title_cleanup !== '') {
                $title = str_replace($title_cleanup, '', $title);
            }
            $title = trim($title);
        }

        return $title;
    }

    /**
     * Remove all elements from HTML content matching cleanup selector
     *
     * @param string|object $content HTML content as HTML object or string
     * @param string $cleanup_selector Selector for the elements to blank out; ignored when empty
     * @param bool $remove_styling Whether to strip `class` and `style` attributes from all elements
     * @return string|object Cleaned content (same type as input)
     */
    protected function cleanArticleContent($content, $cleanup_selector, $remove_styling)
    {
        $string_convert = false;
        if (is_string($content)) {
            $string_convert = true;
            $content = str_get_html($content);
        }

        if (!empty($cleanup_selector)) {
            foreach ($content->find($cleanup_selector) as $item_to_clean) {
                $item_to_clean->outertext = '';
            }
        }

        if ($remove_styling) {
            foreach (['class', 'style'] as $attribute_to_remove) {
                foreach ($content->find('[' . $attribute_to_remove . ']') as $item_to_clean) {
                    $item_to_clean->removeAttribute($attribute_to_remove);
                }
            }
        }

        if ($string_convert) {
            $content = $content->outertext;
        }

        return $content;
    }

    /**
     * Retrieve first N link+element from webpage URL or DOM satisfying the specified criteria
     *
     * @param string|object $page URL or DOM to retrieve feed items from
     * @param string $entry_selector DOM selector for matching HTML elements that contain article
     *                               entries
     * @param string $url_selector DOM selector for matching links
     * @param string $url_pattern Optional filter to keep only links matching the pattern
     * @param int $limit Optional maximum amount of URLs to return
     * @return array Map of link URL => entry element
     */
    protected function htmlFindEntryElements($page, $entry_selector, $url_selector, $url_pattern = '', $limit = 0)
    {
        if (is_string($page)) {
            $page = getSimpleHTMLDOM($page);
        }

        $entryElements = $page->find($entry_selector);
        if (empty($entryElements)) {
            returnClientError('No entry elements for entry selector');
        }

        // Extract URIs with the associated entry element
        $links_with_elements = [];
        foreach ($entryElements as $entry) {
            $url_element = $entry->find($url_selector, 0);
            if (is_null($url_element)) {
                // No link element found inside this entry: accept the entry itself if it is an `a`
                if ($entry->tag == 'a') {
                    $url_element = $entry;
                } else {
                    continue;
                }
            }

            $href = $url_element->href;
            if (empty($href)) {
                // A matched element without a usable href cannot identify an article.
                continue;
            }

            $links_with_elements[$href] = $entry;
        }

        if (empty($links_with_elements)) {
            returnClientError('The provided URL selector matches some elements, but they do not contain links.');
        }

        // Filter using the URL pattern
        $filtered_urls = $this->filterUrlList(array_keys($links_with_elements), $url_pattern, $limit);
        if (empty($filtered_urls)) {
            returnClientError('No results for URL pattern');
        }

        $items = [];
        foreach ($filtered_urls as $link) {
            $items[$link] = $links_with_elements[$link];
        }

        return $items;
    }

    /**
     * Retrieve article element from its URL using content selector and return the DOM element
     *
     * @param string $entry_url URL to retrieve article from
     * @param string $content_selector HTML selector for extracting content, e.g. "article.content"
     * @return object Article DOM element, passed through defaultLinkTo() with the article URL
     */
    protected function fetchArticleElementFromPage($entry_url, $content_selector)
    {
        $entry_html = getSimpleHTMLDOMCached($entry_url);
        $article_content = $entry_html->find($content_selector, 0);

        if (is_null($article_content)) {
            returnClientError('Could not find article content at URL: ' . $entry_url);
        }

        $article_content = defaultLinkTo($article_content, $entry_url);

        return $article_content;
    }

    /**
     * Parse a time string with the given format and convert it to a Unix timestamp.
     *
     * @param string $timeStr Time string to parse
     * @param string $format Format understood by date_parse_from_format()
     * @return int Timestamp built with mktime() from the parsed date components
     */
    protected function parseTimeStrAsTimestamp($timeStr, $format)
    {
        $date = date_parse_from_format($format, $timeStr);
        if ($date['error_count'] != 0) {
            returnClientError('Error while parsing time string');
        }

        $timestamp = mktime(
            $date['hour'],
            $date['minute'],
            $date['second'],
            $date['month'],
            $date['day'],
            $date['year']
        );

        // Strict comparison: a timestamp of 0 is a valid result, only `false` is a failure.
        if ($timestamp === false) {
            returnClientError('Error while creating timestamp');
        }

        return $timestamp;
    }

    /**
     * Build a feed item from a DOM element containing an article
     *
     * @param object $entry_html A DOM element containing the article
     * @param string $title_selector A selector to the article title from the article
     * @param string $author_selector A selector to find the article author
     * @param string $category_selector A selector matching the article's category elements
     * @param string $time_selector A selector to get the article publication time.
     * @param string $time_format The format to parse the time_selector.
     * @param string $content_cleanup Optional selector for removing elements, e.g. "div.ads,
     *                                div.comments"
     * @param string $title_default Optional title to use when could not extract title reliably
     * @param bool $remove_styling Whether to remove class and style attributes from the HTML
     * @return array Entry data: title, content, categories, timestamp, author
     */
    protected function parseEntryElement(
        $entry_html,
        $title_selector = null,
        $author_selector = null,
        $category_selector = null,
        $time_selector = null,
        $time_format = null,
        $content_cleanup = null,
        $title_default = null,
        $remove_styling = false
    ) {
        $article_content = convertLazyLoading($entry_html);

        // Use the default title unless the selector is set and actually matches an element.
        $article_title = $title_default;
        if (!is_null($title_selector) && $title_selector != '') {
            $title_element = $entry_html->find($title_selector, 0);
            if (!is_null($title_element)) {
                $article_title = trim($title_element->innertext);
            }
        }

        $author = null;
        if (!is_null($author_selector) && $author_selector != '') {
            $author_element = $entry_html->find($author_selector, 0);
            if (!is_null($author_element)) {
                $author = trim($author_element->innertext);
            }
        }

        $categories = [];
        if (!is_null($category_selector) && $category_selector != '') {
            $category_elements = $entry_html->find($category_selector);
            foreach ($category_elements as $category_element) {
                $categories[] = trim($category_element->innertext);
            }
        }

        $time = null;
        if (!is_null($time_selector) && $time_selector != '') {
            $time_element = $entry_html->find($time_selector, 0);
            if (!is_null($time_element)) {
                $time_string = $time_element->getAttribute('datetime');
                // NOTE(review): the DOM library may signal a missing attribute with a value
                // other than null (e.g. false) - any empty value falls back to the element text.
                if (empty($time_string)) {
                    $time_string = $time_element->innertext;
                }

                // Keep the parsed timestamp; the raw string is only an intermediate value.
                $time = $this->parseTimeStrAsTimestamp($time_string, $time_format);
            }
        }

        $article_content = $this->cleanArticleContent($article_content, $content_cleanup, $remove_styling);

        $item = [];
        $item['title'] = $article_title;
        $item['content'] = $article_content;
        $item['categories'] = $categories;
        $item['timestamp'] = $time;
        $item['author'] = $author;

        return $item;
    }
}