[HarvardHealthBlogBridge] New (#4116)

This commit is contained in:
tillcash 2024-05-30 00:46:10 +05:30 committed by GitHub
parent 5a68ee0c87
commit bd90109c70
No known key found for this signature in database
GPG key ID: B5690EEEBB952194

View file

@ -0,0 +1,56 @@
<?php
/**
 * Bridge that builds feed items from the Harvard Health blog index page.
 *
 * For each article card found on the index page, the linked article page is
 * fetched (through the cached DOM helper) and its main content block is used
 * as the item content.
 */
class HarvardHealthBlogBridge extends BridgeAbstract
{
    const NAME = 'Harvard Health Blog';
    const URI = 'https://www.health.harvard.edu/blog';
    const DESCRIPTION = 'Retrieve articles from health.harvard.edu';
    const MAINTAINER = 'tillcash';

    /** Upper bound on the number of items collected per run. */
    const MAX_ARTICLES = 10;

    /**
     * Scrape the blog index and append up to MAX_ARTICLES entries to $this->items.
     *
     * Cards without the expected title link are skipped and do not count
     * towards the limit. A card without a usable <time datetime="..."> is still
     * emitted, just without a 'timestamp' key.
     */
    public function collectData()
    {
        $dom = getSimpleHTMLDOM(self::URI);
        $count = 0;

        foreach ($dom->find('div[class="mb-16 md:flex"]') as $element) {
            if ($count >= self::MAX_ARTICLES) {
                break;
            }

            $data = $element->find('a[class="hover:text-red transition-colors duration-200"]', 0);
            if (!$data) {
                continue;
            }

            $url = $data->href;
            $item = [
                'content' => $this->constructContent($url),
                'title' => $data->plaintext,
                'uid' => $url,
                'uri' => $url,
            ];

            // The <time> element is looked up separately: dereferencing a
            // missing element would raise a fatal error and lose every item.
            $time = $element->find('time', 0);
            if ($time && $time->datetime) {
                $item['timestamp'] = $time->datetime;
            }

            $this->items[] = $item;
            $count++;
        }
    }

    /**
     * Fetch an article page and return the inner HTML of its content block.
     *
     * @param string $url Article URL taken from the index page link.
     * @return string Inner HTML of the first element whose class contains
     *                "content-repository-content", with ".inline-ad" elements
     *                blanked out; the literal 'Content Not Found' when no such
     *                element exists.
     */
    private function constructContent($url)
    {
        $dom = getSimpleHTMLDOMCached($url);
        $article = $dom->find('div[class*="content-repository-content"]', 0);
        if (!$article) {
            return 'Content Not Found';
        }

        // Remove ads
        foreach ($article->find('.inline-ad') as $remove) {
            $remove->outertext = '';
        }

        return $article->innertext;
    }
}