[PcGamerBridge] Use meta tags to generate feed contents (#2271)

This commit is contained in:
Matt DeMoss 2021-10-19 02:53:26 -04:00 committed by GitHub
parent 8d0fc54e4d
commit 8f98e07979
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23

View file

@ -2,43 +2,33 @@
class PcGamerBridge extends BridgeAbstract class PcGamerBridge extends BridgeAbstract
{ {
const NAME = 'PC Gamer'; const NAME = 'PC Gamer';
const URI = 'https://www.pcgamer.com/archive/'; const URI = 'https://www.pcgamer.com/';
const DESCRIPTION = 'PC Gamer Most Read Stories'; const DESCRIPTION = 'PC Gamer is your source for exclusive reviews, demos,
const CACHE_TIMEOUT = 3600; updates and news on all your favorite PC gaming franchises.';
const MAINTAINER = 'IceWreck, mdemoss'; const MAINTAINER = 'IceWreck, mdemoss';
public function collectData() public function collectData()
{ {
$html = getSimpleHTMLDOMCached($this->getURI(), 300); $html = getSimpleHTMLDOMCached($this->getURI(), 300);
$stories = $html->find('ul.basic-list li.day-article'); $stories = $html->find('a.article-link');
$i = 0;
// Find individual stories in the archive page
foreach ($stories as $element) { foreach ($stories as $element) {
if($i == 15) break; $item = array();
$item['uri'] = $element->find('a', 0)->href; $item['uri'] = $element->href;
// error_log(print_r($item['uri'], TRUE));
$articleHtml = getSimpleHTMLDOMCached($item['uri']); $articleHtml = getSimpleHTMLDOMCached($item['uri']);
$item['title'] = $element->find('a', 0)->plaintext;
// Meta tags ought to be more reliable than scraping the article markup.
$item['title'] = $articleHtml->find('meta[name=parsely-title]', 0)->content;
$item['content'] = html_entity_decode($articleHtml->find('meta[name=description]', 0)->content);
$item['author'] = $articleHtml->find('meta[name=parsely-author]', 0)->content;
$item['enclosures'][] = $articleHtml->find('meta[name=parsely-image-url]', 0)->content;
/* I don't know why every article has two extra tags, but because
one matches another common tag, "guide," it needs to be removed. */
$item['categories'] = array_diff(
explode(',', $articleHtml->find('meta[name=parsely-tags]', 0)->content),
array('van_buying_guide_progressive', 'serversidehawk')
);
$item['timestamp'] = strtotime($articleHtml->find('meta[name=pub_date]', 0)->content); $item['timestamp'] = strtotime($articleHtml->find('meta[name=pub_date]', 0)->content);
$item['author'] = $articleHtml->find('span.by-author a', 0)->plaintext;
// Get the article content
$articleContents = $articleHtml->find('#article-body', 0);
/*
By default the img src points to an error image, and the actual image
is added in by JS. So we replace the error image with the actual full-size image,
whose link is in one of the attributes of the img tag
*/
foreach($articleContents->find('img') as $img) {
$imgsrc = $img->getAttribute('data-original-mos');
// error_log($imgsrc);
$img->src = $imgsrc;
}
$item['content'] = $articleContents;
$this->items[] = $item; $this->items[] = $item;
$i++;
} }
} }
} }