mirror of
https://github.com/RSS-Bridge/rss-bridge.git
synced 2024-11-22 17:45:40 +03:00
[HeiseBridge] Parser rewrite (#3054)
* [HeiseBridge] Parser rewrite

  This rewrite is more readable and consistent than the previous one. Additionally, it removes unwanted elements, largely recommendations for other articles. Furthermore, it increases the image quality by using the original picture link instead of the compressed one.

* [HeiseBridge] Formatting
This commit is contained in:
parent
3d9fead463
commit
8d8fe66aab
1 changed file with 29 additions and 11 deletions
|
@ -61,24 +61,42 @@ class HeiseBridge extends FeedExpander
|
|||
|
||||
private function addArticleToItem($item, $article)
|
||||
{
|
||||
$authors = $article->find('.a-creator__names', 0)->find('.a-creator__name');
|
||||
// copy full-res img src to standard img element
|
||||
foreach ($article->find('a-img') as $aimg) {
|
||||
$img = $aimg->find('img', 0);
|
||||
$img->src = $aimg->src;
|
||||
// client scales based on aspect ratio in style attribute
|
||||
$img->width = '';
|
||||
$img->height = '';
|
||||
}
|
||||
// relink URIs, as the previous a-img tags weren't recognized by this function
|
||||
$article = defaultLinkTo($article, $item['uri']);
|
||||
|
||||
// remove unwanted stuff
|
||||
foreach ($article->find('figure.branding, a-ad, div.ho-text, noscript img, .opt-in__content-container') as $element) {
|
||||
$element->remove();
|
||||
}
|
||||
// reload html, as remove() is buggy
|
||||
$article = str_get_html($article->outertext);
|
||||
|
||||
$header = $article->find('header.a-article-header', 0);
|
||||
$headerElements = $header->find('p, a-img img, figure img');
|
||||
$item['content'] = implode('', $headerElements);
|
||||
|
||||
$authors = $header->find('.a-creator__names .a-creator__name');
|
||||
if ($authors) {
|
||||
$item['author'] = implode(', ', array_map(function ($e) {
|
||||
return $e->plaintext;
|
||||
}, $authors));
|
||||
}
|
||||
|
||||
$content = $article->find('div[class*="article-content"]', 0);
|
||||
$content = $article->find('.article-content', 0);
|
||||
$contentElements = $content->find(
|
||||
'p, h3, ul, table, pre, a-img img, a-bilderstrecke h2, a-bilderstrecke figure, a-bilderstrecke figcaption'
|
||||
);
|
||||
$item['content'] .= implode('', $contentElements);
|
||||
|
||||
if ($content == null) {
|
||||
$content = $article->find('#article_content', 0);
|
||||
}
|
||||
|
||||
foreach ($content->find('p, h3, ul, table, pre, img') as $element) {
|
||||
$item['content'] .= $element;
|
||||
}
|
||||
|
||||
foreach ($content->find('img') as $img) {
|
||||
foreach ($article->find('a-img img, a-bilderstrecke img, figure img') as $img) {
|
||||
$item['enclosures'][] = $img->src;
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in a new issue