mirror of
https://github.com/RSS-Bridge/rss-bridge.git
synced 2025-03-14 20:21:14 +03:00
[NationalGeographicBridge] Rewrite bridge (#2177)
- All existing options are preserved. - Add the timestamp and the author's name, which are included when the full article is loaded.
This commit is contained in:
parent
cb111a3ebd
commit
8bcf4ebfbf
1 changed files with 242 additions and 74 deletions
|
@ -6,11 +6,12 @@ class NationalGeographicBridge extends BridgeAbstract {
|
|||
const PARAMETER_FULL_ARTICLE = 'full';
|
||||
const TOPIC_MAGAZINE = 'Magazine';
|
||||
const TOPIC_LATEST_STORIES = 'Latest Stories';
|
||||
const CACHE_TIMEOUT = 900; //15 min
|
||||
|
||||
const NAME = 'National Geographic';
|
||||
const URI = 'https://www.nationalgeographic.com/';
|
||||
const DESCRIPTION = 'Fetches the latest articles from the National Geographic Magazine';
|
||||
const MAINTAINER = 'logmanoriginal';
|
||||
const MAINTAINER = 'csisoap';
|
||||
const PARAMETERS = array(
|
||||
self::CONTEXT_BY_TOPIC => array(
|
||||
self::PARAMETER_TOPIC => array(
|
||||
|
@ -28,12 +29,22 @@ class NationalGeographicBridge extends BridgeAbstract {
|
|||
self::PARAMETER_FULL_ARTICLE => array(
|
||||
'name' => 'Full Article',
|
||||
'type' => 'checkbox',
|
||||
'title' => 'Enable to load full articles (takes longer)'
|
||||
'title' => 'Enable to load full articles and other infos (takes longer)'
|
||||
)
|
||||
)
|
||||
);
|
||||
|
||||
private $topicName = '';
|
||||
const CONTEXT = 'eyJjb250ZW50VHlwZSI6IlVuaXNvbkh1YiIsInZhcmlhYmxlcyI6eyJsb2NhdG9yIjoiL3BhZ2VzL3
|
||||
RvcGljL2xhdGVzdC1zdG9yaWVzIiwicG9ydGZvbGlvIjoibmF0Z2VvIiwicXVlcn
|
||||
lUeXBlIjoiTE9DQVRPUiJ9LCJtb2R1bGVJZCI6bnVsbH0';
|
||||
const LATEST_STORIES_ID = array(
|
||||
'1df278bb-0e3d-4a67-a0ce-8fae48392822-f2-m1'
|
||||
);
|
||||
const MAGAZINE_ID = array(
|
||||
'94d87d74-f41a-4a32-9acd-b591ba2df288-f2-m1',
|
||||
'94d87d74-f41a-4a32-9acd-b591ba2df288-f5-m2',
|
||||
);
|
||||
|
||||
public function getURI() {
|
||||
switch ($this->queriedContext) {
|
||||
|
@ -46,9 +57,16 @@ class NationalGeographicBridge extends BridgeAbstract {
|
|||
}
|
||||
}
|
||||
|
||||
private function getAPIURL($id) {
|
||||
$context = preg_replace('/\s*/m', '', self::CONTEXT);
|
||||
$url = 'https://www.nationalgeographic.com/proxy/hub?context='
|
||||
. $context . '&id=' . $id
|
||||
. '&moduleType=InfiniteFeedModule&_xhr=pageContent';
|
||||
return $url;
|
||||
}
|
||||
|
||||
public function collectData() {
|
||||
$this->topicName = $this->getTopicName($this->getInput(self::PARAMETER_TOPIC));
|
||||
|
||||
switch($this->topicName) {
|
||||
case self::TOPIC_MAGAZINE: {
|
||||
return $this->collectMagazine();
|
||||
|
@ -78,28 +96,35 @@ class NationalGeographicBridge extends BridgeAbstract {
|
|||
}
|
||||
|
||||
private function collectMagazine() {
|
||||
$uri = $this->getURI();
|
||||
$stories = array();
|
||||
|
||||
$html = getSimpleHTMLDOM($uri)
|
||||
or returnServerError('Could not request ' . $uri);
|
||||
foreach(self::MAGAZINE_ID as $id) {
|
||||
$uri = $this->getAPIURL($id);
|
||||
|
||||
$script = $html->find('#lead-component script')[0];
|
||||
$json_raw = getContents($uri);
|
||||
|
||||
$json = json_decode($script->innertext, true);
|
||||
$json = json_decode($json_raw, true)['tiles'];
|
||||
$stories = array_merge($json, $stories);
|
||||
}
|
||||
|
||||
// This is probably going to break in the future, fix it then :)
|
||||
foreach($json['body']['0']['multilayout_promo_beta']['stories'] as $story) {
|
||||
foreach($stories as $story) {
|
||||
$this->addStory($story);
|
||||
}
|
||||
}
|
||||
|
||||
private function collectLatestStories() {
|
||||
$uri = self::URI . 'latest-stories/_jcr_content/content/hubfeed.promo-hub-feed-all-stories.json';
|
||||
$stories = array();
|
||||
|
||||
$json_raw = getContents($uri)
|
||||
or returnServerError('Could not request ' . $uri);
|
||||
foreach(self::LATEST_STORIES_ID as $id) {
|
||||
$uri = $this->getAPIURL($id);
|
||||
|
||||
foreach(json_decode($json_raw, true) as $story) {
|
||||
$json_raw = getContents($uri);
|
||||
|
||||
$json = json_decode($json_raw, true)['tiles'];
|
||||
$stories = array_merge($stories, $json);
|
||||
}
|
||||
|
||||
foreach($stories as $story) {
|
||||
$this->addStory($story);
|
||||
}
|
||||
}
|
||||
|
@ -107,88 +132,231 @@ class NationalGeographicBridge extends BridgeAbstract {
|
|||
private function addStory($story) {
|
||||
$title = 'Unknown title';
|
||||
$content = '';
|
||||
$story_type = '';
|
||||
$uri = '';
|
||||
|
||||
foreach($story['components'] as $component) {
|
||||
switch($component['content_type']) {
|
||||
case 'title': {
|
||||
$title = $component['title']['text'];
|
||||
} break;
|
||||
case 'dek': {
|
||||
$content = $component['dek']['text'];
|
||||
} break;
|
||||
}
|
||||
foreach($story['ctas'] as $component) {
|
||||
$uri = $component['url'];
|
||||
$story_type = $component['icon'];
|
||||
}
|
||||
|
||||
$item = array();
|
||||
|
||||
$item['uri'] = $story['uri'];
|
||||
$item['title'] = $title;
|
||||
if(isset($story['description'])) {
|
||||
$content = '<p>' . $story['description'] . '</p>';
|
||||
}
|
||||
$title = $story['title'];
|
||||
$item['uri'] = $uri;
|
||||
$item['title'] = $story['title'];
|
||||
|
||||
// if full article is requested!
|
||||
if ($this->getInput(self::PARAMETER_FULL_ARTICLE))
|
||||
$item['content'] = $this->getFullArticle($item['uri']);
|
||||
else
|
||||
if ($this->getInput(self::PARAMETER_FULL_ARTICLE)) {
|
||||
if($story_type != 'interactive') {
|
||||
/* Nat Geo doesn't provide much info about interactive pages
|
||||
* and it requires JS to load the interactive.
|
||||
*/
|
||||
$article_data = $this->getFullArticle($item['uri']);
|
||||
$item['timestamp'] = $article_data['published_date'];
|
||||
$item['author'] = $article_data['authors'];
|
||||
$item['content'] = $content . $article_data['content'];
|
||||
} else {
|
||||
$item['content'] = $content;
|
||||
}
|
||||
} else
|
||||
$item['content'] = $content;
|
||||
|
||||
if (isset($story['promo_image'])) {
|
||||
switch($story['promo_image']['content_type']) {
|
||||
case 'image': {
|
||||
$item['enclosures'][] = $story['promo_image']['image']['uri'];
|
||||
} break;
|
||||
}
|
||||
}
|
||||
$image = $story['img'];
|
||||
$item['enclosures'][] = $image['src'];
|
||||
|
||||
if (isset($story['lead_media'])) {
|
||||
$media = $story['lead_media'];
|
||||
switch($media['content_type']) {
|
||||
case 'image': {
|
||||
// Don't add if promo_image was added
|
||||
if (empty($item['enclosures']))
|
||||
$item['enclosures'][] = $media['image']['uri'];
|
||||
} break;
|
||||
case 'image_gallery': {
|
||||
foreach($media['image_gallery']['images'] as $image) {
|
||||
$item['enclosures'][] = $image['uri'];
|
||||
}
|
||||
} break;
|
||||
}
|
||||
$tags = $story['tags'];
|
||||
foreach($tags as $tag) {
|
||||
$tag_name = $tag['name'];
|
||||
$item['categories'][] = $tag_name;
|
||||
}
|
||||
|
||||
$this->items[] = $item;
|
||||
}
|
||||
|
||||
private function filterArticleData($data) {
|
||||
$article_module = array_filter(
|
||||
$data, function ($item) {
|
||||
if(isset($item['id']) && $item['id'] == 'natgeo-template1-frame-1') {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
);
|
||||
|
||||
$article_data = array_reduce(
|
||||
$article_module,
|
||||
function (array $carry, array $item) {
|
||||
$module = $item['mods'];
|
||||
return array_merge(
|
||||
$carry,
|
||||
array_filter(
|
||||
$module, function ($data) {
|
||||
return $data['id'] == 'natgeo-template1-frame-1-module-1';
|
||||
}
|
||||
)
|
||||
);
|
||||
},
|
||||
array()
|
||||
);
|
||||
|
||||
return $article_data[0];
|
||||
}
|
||||
|
||||
private function handleImages($image_module, $image_type) {
|
||||
$image_alt = '';
|
||||
$image_credit = '';
|
||||
$image_src = '';
|
||||
$image_caption = '';
|
||||
$caption = '';
|
||||
switch($image_type) {
|
||||
case 'image':
|
||||
case 'imagegroup':
|
||||
$image = $image_module['image'];
|
||||
$image_src = $image['src'];
|
||||
if(isset($image_module['alt'])) {
|
||||
$image_alt = $image_module['alt'];
|
||||
} elseif(isset($image['altText'])) {
|
||||
$image_alt = $image['altText'];
|
||||
}
|
||||
if(isset($image['crdt'])) {
|
||||
$image_credit = $image['crdt'];
|
||||
}
|
||||
$caption = (isset($image_module['caption']) ? $image_module['caption'] : '');
|
||||
break;
|
||||
case 'photogallery':
|
||||
$image_credit = (isset($image_module['caption']['credit']) ? $image_module['caption']['credit'] : '');
|
||||
$caption = $image_module['caption']['text'];
|
||||
$image_src = $image_module['img']['src'];
|
||||
$image_alt = $image_module['img']['altText'];
|
||||
break;
|
||||
case 'video':
|
||||
$image_credit = (isset($image_module['credit']) ? $image_module['credit'] : '');
|
||||
$description = (isset($image_module['description']) ? $image_module['description'] : '');
|
||||
$caption = $description . ' Video can be watched on the article\'s page';
|
||||
$image = $image_module['image'];
|
||||
$image_alt = $image['altText'];
|
||||
$image_src = $image['src'];
|
||||
}
|
||||
|
||||
$image_caption = $caption . ' ' . $image_credit
|
||||
. '. Notes: Some image may have copyrighted on it.';
|
||||
$wrapper = <<<EOD
|
||||
<figure>
|
||||
<img src="{$image_src}" alt="{$image_alt}">
|
||||
<figcaption>$image_caption</figcaption>
|
||||
</figure>
|
||||
EOD;
|
||||
return $wrapper;
|
||||
}
|
||||
|
||||
private function getFullArticle($uri) {
|
||||
$html = getSimpleHTMLDOMCached($uri)
|
||||
$html = getContents($uri)
|
||||
or returnServerError('Could not load ' . $uri);
|
||||
|
||||
$html = defaultLinkTo($html, $uri);
|
||||
$scriptRegex = '/window\[\'__natgeo__\'\]=(.*);<\/script>/';
|
||||
|
||||
$content = '';
|
||||
preg_match($scriptRegex, $html, $matches, PREG_OFFSET_CAPTURE, 0);
|
||||
|
||||
foreach($html->find('
|
||||
.content > .smartbody.text,
|
||||
.content > .section.image script[type="text/json"],
|
||||
.content > .section.image span[itemprop="caption"],
|
||||
.content > .section.inline script[type="text/json"]
|
||||
') as $element) {
|
||||
if ($element->tag === 'script') {
|
||||
$json = json_decode($element->innertext, true);
|
||||
if (isset($json['src'])) {
|
||||
$content .= '<img src="' . $json['src'] . '" width="100%" alt="' . $json['alt'] . '">';
|
||||
} elseif (isset($json['galleryType']) && isset($json['endpoint'])) {
|
||||
$doc = getContents($json['endpoint'])
|
||||
or returnServerError('Could not load ' . $json['endpoint']);
|
||||
$json = json_decode($doc, true);
|
||||
foreach($json['items'] as $item) {
|
||||
$content .= '<p>' . $item['caption'] . '</p>';
|
||||
$content .= '<img src="' . $item['url'] . '" width="100%" alt="' . $item['caption'] . '">';
|
||||
}
|
||||
}
|
||||
$json = json_decode($matches[1][0], true);
|
||||
|
||||
$unfiltered_data = $json['page']['content']['article']['frms'];
|
||||
$filtered_data = $this->filterArticleData($unfiltered_data);
|
||||
|
||||
$article = $filtered_data['edgs'][0];
|
||||
|
||||
$contributors = $article['cntrbGrp'];
|
||||
$authors = array();
|
||||
if(count($contributors) > 0) {
|
||||
$authors = $contributors[0]['contributors'];
|
||||
}
|
||||
|
||||
$authors_name = '';
|
||||
$counter = 0;
|
||||
foreach($authors as $author) {
|
||||
$counter++;
|
||||
if($counter == count($authors)) {
|
||||
$authors_name .= $author['displayName'];
|
||||
} else {
|
||||
$content .= $element->outertext;
|
||||
$authors_name .= $author['displayName'] . ', ';
|
||||
}
|
||||
}
|
||||
|
||||
return $content;
|
||||
$published_date = $article['pbDt'];
|
||||
$article_body = $article['bdy'];
|
||||
$content = '';
|
||||
|
||||
foreach($article_body as $body) {
|
||||
switch($body['type']) {
|
||||
case 'p':
|
||||
$content .= '<p>' . $body['cntnt']['mrkup'] . '</p>';
|
||||
break;
|
||||
case 'h2':
|
||||
$content .= '<h2>' . $body['cntnt']['mrkup'] . '</h2>';
|
||||
break;
|
||||
case 'inline':
|
||||
$module = $body['cntnt'];
|
||||
if(empty($module))
|
||||
continue 2;
|
||||
switch($module['cmsType']) {
|
||||
case 'image':
|
||||
$content .= $this->handleImages($module, $module['cmsType']);
|
||||
break;
|
||||
case 'imagegroup':
|
||||
$images = $module['images'];
|
||||
foreach($images as $image) {
|
||||
$content .= $this->handleImages($image, $module['cmsType']);
|
||||
}
|
||||
break;
|
||||
case 'editorsNote':
|
||||
$content .= $module['note'];
|
||||
break;
|
||||
case 'listicle':
|
||||
$content .= '<h2>' . $module['title'] . '</h2>';
|
||||
if(isset($module['image'])) {
|
||||
$content .= $this->handleImages($module['image'], $module['image']['cmsType']);
|
||||
}
|
||||
$content .= '<p>' . (isset($module['text']) ? $module['text'] : '') . '</p>';
|
||||
break;
|
||||
case 'photogallery':
|
||||
$gallery = $body['cntnt']['media'];
|
||||
foreach($gallery as $image) {
|
||||
$content .= $this->handleImages($image, $module['cmsType']);
|
||||
}
|
||||
break;
|
||||
case 'video':
|
||||
$content .= $this->handleImages($module, $module['cmsType']);
|
||||
break;
|
||||
case 'pullquote';
|
||||
$quote = $module['quote'];
|
||||
$author_name = '';
|
||||
$authors = (isset($module['byLineProps']['authors']) ? $module['byLineProps']['authors'] : array());
|
||||
foreach($authors as $author) {
|
||||
$author_desc = (isset($author['authorDesc']) ? $author['authorDesc'] : '');
|
||||
$author_name .= $author['displayName'] . ', ' . $author_desc;
|
||||
}
|
||||
$content .= <<<EOD
|
||||
<figure>
|
||||
<blockquote>
|
||||
<p>$quote</p>
|
||||
</blockquote>
|
||||
<figcaption>$author_name</figcaption>
|
||||
</figure>
|
||||
EOD;
|
||||
break;
|
||||
}
|
||||
break;
|
||||
case 'ul':
|
||||
$content .= $body['cntnt']['mrkup'] . '<hr>';
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return array(
|
||||
'content' => $content,
|
||||
'published_date' => $published_date,
|
||||
'authors' => $authors_name
|
||||
);
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Add table
Reference in a new issue