2020-11-11 20:39:34 +03:00
|
|
|
<?php
|
|
|
|
/**
 * Bridge producing a feed of the articles listed on a Seznam Zprávy author page.
 *
 * The only supported context is "By Author": the author slug from the site's
 * URL is appended to the author-page URL, the article list on that page is
 * read, and every linked article is loaded to extract its content, header
 * image and breadcrumb categories.
 */
class SeznamZpravyBridge extends BridgeAbstract {
    const NAME = 'Seznam Zprávy Bridge';
    const URI = 'https://seznamzpravy.cz';
    const DESCRIPTION = 'Returns newest stories from Seznam Zprávy';
    const MAINTAINER = 'thezeroalpha';
    const PARAMETERS = array(
        'By Author' => array(
            'author' => array(
                'name' => 'Author String',
                'type' => 'text',
                'required' => true,
                'title' => 'The dash-separated author string, as shown in the URL bar.',
                'pattern' => '[a-z]+-[a-z]+-[0-9]+',
                'exampleValue' => 'radek-nohl-1'
            ),
        )
    );

    /**
     * Feed title derived from the author name; unset until collectData() has run.
     *
     * @var string|null
     */
    private $feedName;

    /**
     * Returns the author-specific feed name once it is known, otherwise the
     * name provided by the parent class.
     *
     * @return string
     */
    public function getName() {
        if (isset($this->feedName)) {
            return $this->feedName;
        }

        return parent::getName();
    }

    /**
     * Collects the articles of the requested author into $this->items.
     *
     * Each item carries title, uri, timestamp, author, content and categories,
     * plus an 'enclosures' entry when the article has a header image. A server
     * error is reported through returnServerError() whenever a mandatory
     * element (breadcrumbs, author, article list, title, content, paragraphs,
     * time) cannot be found.
     *
     * @return void
     */
    public function collectData() {
        // Passed to getSimpleHTMLDOMCached() as its second argument
        // (presumably the cache lifetime in seconds); one day is 86400 s.
        $oneDay = 86400;

        switch ($this->queriedContext) {
            case 'By Author':
                $url = 'https://www.seznamzpravy.cz/autor/';
                $selectors = array(
                    'breadcrumbs' => 'div[data-dot=ogm-breadcrumb-navigation]',
                    'articleList' => 'ul.ogm-document-timeline-page li article[data-dot=mol-timeline-item]',
                    'articleTitle' => 'a[data-dot=mol-article-card-title]',
                    'articleDM' => 'span.mol-formatted-date__date',
                    'articleTime' => 'span.mol-formatted-date__time',
                    'articleContent' => 'div[data-dot=ogm-article-content]',
                    'articleImage' => 'div[data-dot=ogm-main-media] img',
                    'articleParagraphs' => 'div[data-dot=mol-paragraph]'
                );

                $html = getSimpleHTMLDOMCached($url . $this->getInput('author'), $oneDay);

                // The last breadcrumb on the author page holds the author's display name.
                $mainBreadcrumbs = $html->find($selectors['breadcrumbs'], 0)
                    or returnServerError('Could not get breadcrumbs for: ' . $this->getURI());

                $author = $mainBreadcrumbs->last_child()->plaintext
                    or returnServerError('Could not get author for: ' . $this->getURI());

                $this->feedName = $author . ' - Seznam Zprávy';

                $articles = $html->find($selectors['articleList'])
                    or returnServerError('Could not find articles for: ' . $this->getURI());

                foreach ($articles as $article) {
                    // Get article URL
                    $titleLink = $article->find($selectors['articleTitle'], 0)
                        or returnServerError('Could not find title for: ' . $this->getURI());
                    $articleURL = $titleLink->href;

                    $articleContentHTML = getSimpleHTMLDOMCached($articleURL, $oneDay);

                    // Article header image (optional; null when the article has none)
                    $articleImageElem = $articleContentHTML->find($selectors['articleImage'], 0);

                    // Article text content
                    $contentElem = $articleContentHTML->find($selectors['articleContent'], 0)
                        or returnServerError('Could not get article content for: ' . $articleURL);
                    $contentParagraphs = $contentElem->find($selectors['articleParagraphs'])
                        or returnServerError('Could not find paragraphs for: ' . $articleURL);

                    // If the article has an image, put that image at the start
                    $contentInitialValue = isset($articleImageElem) ? $articleImageElem->outertext : '';
                    $contentText = array_reduce($contentParagraphs, function ($s, $elem) {
                        return $s . $elem->innertext;
                    }, $contentInitialValue);

                    // Article categories: every breadcrumb except the last one
                    $breadcrumbsElem = $articleContentHTML->find($selectors['breadcrumbs'], 0)
                        or returnServerError('Could not find breadcrumbs for: ' . $articleURL);
                    $categories = array_map(function ($cat) {
                        return trim($cat->plaintext);
                    }, array_slice($breadcrumbsElem->children(), 0, -1));

                    // Article date & time
                    $articleTimeElem = $article->find($selectors['articleTime'], 0)
                        or returnServerError('Could not find article time for: ' . $articleURL);
                    $articleTime = $articleTimeElem->plaintext;

                    $articleDMElem = $article->find($selectors['articleDM'], 0);
                    if (isset($articleDMElem)) {
                        $articleDMText = $articleDMElem->plaintext;
                    } else {
                        // If there is no date but only a time, the article was published today
                        $articleDMText = date('d.m.');
                    }

                    // The listing only provides day and month, so the year has to be
                    // guessed. Start with the current year; if that lands more than a
                    // day in the future (e.g. a December article read in January),
                    // the article must belong to the previous year.
                    $dayMonth = preg_replace('/[^0-9\.]/', '', $articleDMText);
                    $year = (int)date('Y');
                    $timestamp = strtotime($dayMonth . $year . ' ' . $articleTime);
                    if ($timestamp !== false && $timestamp > time() + $oneDay) {
                        $timestamp = strtotime($dayMonth . ($year - 1) . ' ' . $articleTime);
                    }

                    // Add article to items, potentially with header image as enclosure
                    $item = array(
                        'title' => $titleLink->plaintext,
                        'uri' => $titleLink->href,
                        'timestamp' => $timestamp,
                        'author' => $author,
                        'content' => $contentText,
                        'categories' => $categories
                    );

                    if (isset($articleImageElem)) {
                        $imageSrc = $articleImageElem->src;
                        // Only protocol-relative URLs ("//host/path") need a scheme;
                        // prefixing anything else would corrupt the URL.
                        if (strpos($imageSrc, '//') === 0) {
                            $imageSrc = 'https:' . $imageSrc;
                        }
                        $item['enclosures'] = array($imageSrc);
                    }

                    $this->items[] = $item;
                }
                break;
        }
        // Items are appended inside the loop only; appending $item again here
        // would duplicate the last article.
    }
}
|