From d082bfca4a2b97246cffe40fbb029f94c21480a2 Mon Sep 17 00:00:00 2001 From: Alex Balgavy <8124851+thezeroalpha@users.noreply.github.com> Date: Thu, 14 Apr 2022 14:38:16 +0200 Subject: [PATCH] [SeznamZpravyBridge] fix: broken bridge (#2658) --- bridges/SeznamZpravyBridge.php | 94 +++++++++++++++++++++++----------- 1 file changed, 64 insertions(+), 30 deletions(-) diff --git a/bridges/SeznamZpravyBridge.php b/bridges/SeznamZpravyBridge.php index bfd0f1d0..06072cdd 100644 --- a/bridges/SeznamZpravyBridge.php +++ b/bridges/SeznamZpravyBridge.php @@ -12,7 +12,7 @@ class SeznamZpravyBridge extends BridgeAbstract { 'required' => true, 'title' => 'The dash-separated author string, as shown in the URL bar.', 'pattern' => '[a-z]+-[a-z]+-[0-9]+', - 'exampleValue' => 'janek-rubes-506' + 'exampleValue' => 'radek-nohl-1' ), ) ); @@ -33,55 +33,89 @@ class SeznamZpravyBridge extends BridgeAbstract { $url = 'https://www.seznamzpravy.cz/autor/'; $selectors = array( 'breadcrumbs' => 'div[data-dot=ogm-breadcrumb-navigation]', - 'article_list' => 'ul.ogm-document-timeline-page.atm-list-ul li article[data-dot=mol-timeline-item]', - 'article_title' => 'a[data-dot=mol-article-card-title]', - 'article_dm' => 'span.mol-formatted-date__date', - 'article_time' => 'span.mol-formatted-date__time', - 'article_content' => 'div[data-dot=ogm-article-content]' + 'articleList' => 'ul.ogm-document-timeline-page li article[data-dot=mol-timeline-item]', + 'articleTitle' => 'a[data-dot=mol-article-card-title]', + 'articleDM' => 'span.mol-formatted-date__date', + 'articleTime' => 'span.mol-formatted-date__time', + 'articleContent' => 'div[data-dot=ogm-article-content]', + 'articleImage' => 'div[data-dot=ogm-main-media] img', + 'articleParagraphs' => 'div[data-dot=mol-paragraph]' ); $html = getSimpleHTMLDOMCached($url . $this->getInput('author'), $ONE_DAY); - $main_breadcrumbs = $html->find($selectors['breadcrumbs'], 0); - $author = $main_breadcrumbs->last_child()->plaintext - or returnServerError('Could not get author on: ' . $this->getURI()); + $mainBreadcrumbs = $html->find($selectors['breadcrumbs'], 0) + or returnServerError('Could not get breadcrumbs for: ' . $this->getURI()); + + $author = $mainBreadcrumbs->last_child()->plaintext + or returnServerError('Could not get author for: ' . $this->getURI()); + $this->feedName = $author . ' - Seznam Zprávy'; - $articles = $html->find($selectors['article_list']) - or returnServerError('Could not find articles on: ' . $this->getURI()); + $articles = $html->find($selectors['articleList']) + or returnServerError('Could not find articles for: ' . $this->getURI()); foreach ($articles as $article) { - $title_link = $article->find($selectors['article_title'], 0) - or returnServerError('Could not find title on: ' . $this->getURI()); + // Get article URL + $titleLink = $article->find($selectors['articleTitle'], 0) + or returnServerError('Could not find title for: ' . $this->getURI()); + $articleURL = $titleLink->href; - $article_url = $title_link->href; - $article_content_html = getSimpleHTMLDOMCached($article_url, $ONE_DAY); - $content_e = $article_content_html->find($selectors['article_content'], 0); - $content_text = $content_e->innertext - or returnServerError('Could not get article content for: ' . $article_url); + $articleContentHTML = getSimpleHTMLDOMCached($articleURL, $ONE_DAY); - $breadcrumbs_e = $article_content_html->find($selectors['breadcrumbs'], 0); - $breadcrumbs = $breadcrumbs_e->children(); - $num_breadcrumbs = count($breadcrumbs); + // Article header image + $articleImageElem = $articleContentHTML->find($selectors['articleImage'], 0); + + // Article text content + $contentElem = $articleContentHTML->find($selectors['articleContent'], 0) + or returnServerError('Could not get article content for: ' . $articleURL); + $contentParagraphs = $contentElem->find($selectors['articleParagraphs']) + or returnServerError('Could not find paragraphs for: ' . $articleURL); + + // If the article has an image, put that image at the start + $contentInitialValue = isset($articleImageElem) ? $articleImageElem->outertext : ''; + $contentText = array_reduce($contentParagraphs, function($s, $elem) { + return $s . $elem->innertext; + }, $contentInitialValue); + + // Article categories + $breadcrumbsElem = $articleContentHTML->find($selectors['breadcrumbs'], 0) + or returnServerError('Could not find breadcrumbs for: ' . $articleURL); + $breadcrumbs = $breadcrumbsElem->children(); + $numBreadcrumbs = count($breadcrumbs); $categories = array(); foreach ($breadcrumbs as $cat) { - if (--$num_breadcrumbs <= 0) { + if (--$numBreadcrumbs <= 0) { break; } $categories[] = trim($cat->plaintext); } - $article_dm_e = $article->find($selectors['article_dm'], 0); - $article_dm_text = $article_dm_e->plaintext; - $article_dmy = preg_replace('/[^0-9\.]/', '', $article_dm_text) . date('Y'); - $article_time = $article->find($selectors['article_time'], 0)->plaintext; + // Article date & time + $articleTimeElem = $article->find($selectors['articleTime'], 0) + or returnServerError('Could not find article time for: ' . $articleURL); + $articleTime = $articleTimeElem->plaintext; + + $articleDMElem = $article->find($selectors['articleDM'], 0); + if (isset($articleDMElem)) { + $articleDMText = $articleDMElem->plaintext; + } else { + // If there is no date but only a time, the article was published today + $articleDMText = date('d.m.'); + } + $articleDMY = preg_replace('/[^0-9\.]/', '', $articleDMText) . date('Y'); + + // Add article to items, potentially with header image as enclosure $item = array( - 'title' => $title_link->plaintext, - 'uri' => $title_link->href, - 'timestamp' => strtotime($article_dmy . ' ' . $article_time), + 'title' => $titleLink->plaintext, + 'uri' => $titleLink->href, + 'timestamp' => strtotime($articleDMY . ' ' . $articleTime), 'author' => $author, - 'content' => $content_text, + 'content' => $contentText, 'categories' => $categories ); + if (isset($articleImageElem)) { + $item['enclosures'] = array('https:' . $articleImageElem->src); + } $this->items[] = $item; } break;