[SeznamZpravyBridge] fix: broken bridge (#2658)

This commit is contained in:
Alex Balgavy 2022-04-14 14:38:16 +02:00 committed by GitHub
parent 91283f3a62
commit d082bfca4a
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23

View file

@ -12,7 +12,7 @@ class SeznamZpravyBridge extends BridgeAbstract {
'required' => true, 'required' => true,
'title' => 'The dash-separated author string, as shown in the URL bar.', 'title' => 'The dash-separated author string, as shown in the URL bar.',
'pattern' => '[a-z]+-[a-z]+-[0-9]+', 'pattern' => '[a-z]+-[a-z]+-[0-9]+',
'exampleValue' => 'janek-rubes-506' 'exampleValue' => 'radek-nohl-1'
), ),
) )
); );
@ -33,55 +33,89 @@ class SeznamZpravyBridge extends BridgeAbstract {
$url = 'https://www.seznamzpravy.cz/autor/'; $url = 'https://www.seznamzpravy.cz/autor/';
$selectors = array( $selectors = array(
'breadcrumbs' => 'div[data-dot=ogm-breadcrumb-navigation]', 'breadcrumbs' => 'div[data-dot=ogm-breadcrumb-navigation]',
'article_list' => 'ul.ogm-document-timeline-page.atm-list-ul li article[data-dot=mol-timeline-item]', 'articleList' => 'ul.ogm-document-timeline-page li article[data-dot=mol-timeline-item]',
'article_title' => 'a[data-dot=mol-article-card-title]', 'articleTitle' => 'a[data-dot=mol-article-card-title]',
'article_dm' => 'span.mol-formatted-date__date', 'articleDM' => 'span.mol-formatted-date__date',
'article_time' => 'span.mol-formatted-date__time', 'articleTime' => 'span.mol-formatted-date__time',
'article_content' => 'div[data-dot=ogm-article-content]' 'articleContent' => 'div[data-dot=ogm-article-content]',
'articleImage' => 'div[data-dot=ogm-main-media] img',
'articleParagraphs' => 'div[data-dot=mol-paragraph]'
); );
$html = getSimpleHTMLDOMCached($url . $this->getInput('author'), $ONE_DAY); $html = getSimpleHTMLDOMCached($url . $this->getInput('author'), $ONE_DAY);
$main_breadcrumbs = $html->find($selectors['breadcrumbs'], 0); $mainBreadcrumbs = $html->find($selectors['breadcrumbs'], 0)
$author = $main_breadcrumbs->last_child()->plaintext or returnServerError('Could not get breadcrumbs for: ' . $this->getURI());
or returnServerError('Could not get author on: ' . $this->getURI());
$author = $mainBreadcrumbs->last_child()->plaintext
or returnServerError('Could not get author for: ' . $this->getURI());
$this->feedName = $author . ' - Seznam Zprávy'; $this->feedName = $author . ' - Seznam Zprávy';
$articles = $html->find($selectors['article_list']) $articles = $html->find($selectors['articleList'])
or returnServerError('Could not find articles on: ' . $this->getURI()); or returnServerError('Could not find articles for: ' . $this->getURI());
foreach ($articles as $article) { foreach ($articles as $article) {
$title_link = $article->find($selectors['article_title'], 0) // Get article URL
or returnServerError('Could not find title on: ' . $this->getURI()); $titleLink = $article->find($selectors['articleTitle'], 0)
or returnServerError('Could not find title for: ' . $this->getURI());
$articleURL = $titleLink->href;
$article_url = $title_link->href; $articleContentHTML = getSimpleHTMLDOMCached($articleURL, $ONE_DAY);
$article_content_html = getSimpleHTMLDOMCached($article_url, $ONE_DAY);
$content_e = $article_content_html->find($selectors['article_content'], 0);
$content_text = $content_e->innertext
or returnServerError('Could not get article content for: ' . $article_url);
$breadcrumbs_e = $article_content_html->find($selectors['breadcrumbs'], 0); // Article header image
$breadcrumbs = $breadcrumbs_e->children(); $articleImageElem = $articleContentHTML->find($selectors['articleImage'], 0);
$num_breadcrumbs = count($breadcrumbs);
// Article text content
$contentElem = $articleContentHTML->find($selectors['articleContent'], 0)
or returnServerError('Could not get article content for: ' . $articleURL);
$contentParagraphs = $contentElem->find($selectors['articleParagraphs'])
or returnServerError('Could not find paragraphs for: ' . $articleURL);
// If the article has an image, put that image at the start
$contentInitialValue = isset($articleImageElem) ? $articleImageElem->outertext : '';
$contentText = array_reduce($contentParagraphs, function($s, $elem) {
return $s . $elem->innertext;
}, $contentInitialValue);
// Article categories
$breadcrumbsElem = $articleContentHTML->find($selectors['breadcrumbs'], 0)
or returnServerError('Could not find breadcrumbs for: ' . $articleURL);
$breadcrumbs = $breadcrumbsElem->children();
$numBreadcrumbs = count($breadcrumbs);
$categories = array(); $categories = array();
foreach ($breadcrumbs as $cat) { foreach ($breadcrumbs as $cat) {
if (--$num_breadcrumbs <= 0) { if (--$numBreadcrumbs <= 0) {
break; break;
} }
$categories[] = trim($cat->plaintext); $categories[] = trim($cat->plaintext);
} }
$article_dm_e = $article->find($selectors['article_dm'], 0); // Article date & time
$article_dm_text = $article_dm_e->plaintext; $articleTimeElem = $article->find($selectors['articleTime'], 0)
$article_dmy = preg_replace('/[^0-9\.]/', '', $article_dm_text) . date('Y'); or returnServerError('Could not find article time for: ' . $articleURL);
$article_time = $article->find($selectors['article_time'], 0)->plaintext; $articleTime = $articleTimeElem->plaintext;
$articleDMElem = $article->find($selectors['articleDM'], 0);
if (isset($articleDMElem)) {
$articleDMText = $articleDMElem->plaintext;
} else {
// If there is no date but only a time, the article was published today
$articleDMText = date('d.m.');
}
$articleDMY = preg_replace('/[^0-9\.]/', '', $articleDMText) . date('Y');
// Add article to items, potentially with header image as enclosure
$item = array( $item = array(
'title' => $title_link->plaintext, 'title' => $titleLink->plaintext,
'uri' => $title_link->href, 'uri' => $titleLink->href,
'timestamp' => strtotime($article_dmy . ' ' . $article_time), 'timestamp' => strtotime($articleDMY . ' ' . $articleTime),
'author' => $author, 'author' => $author,
'content' => $content_text, 'content' => $contentText,
'categories' => $categories 'categories' => $categories
); );
if (isset($articleImageElem)) {
$item['enclosures'] = array('https:' . $articleImageElem->src);
}
$this->items[] = $item; $this->items[] = $item;
} }
break; break;