[HeiseBridge] Parser rewrite (#3054)

* [HeiseBridge] Parser rewrite

This rewrite is more readable and consistent than the previous one.

Additionally, this removes unwanted elements, largely recommendations
for other articles.
Furthermore, it increases the image quality by using the original
picture link instead of the compressed ones.

* [HeiseBridge] Formatting
This commit is contained in:
Mynacol 2022-09-21 21:31:43 +02:00 committed by GitHub
parent 3d9fead463
commit 8d8fe66aab
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23

View file

@ -61,24 +61,42 @@ class HeiseBridge extends FeedExpander
private function addArticleToItem($item, $article)
{
$authors = $article->find('.a-creator__names', 0)->find('.a-creator__name');
// copy full-res img src to standard img element
foreach ($article->find('a-img') as $aimg) {
$img = $aimg->find('img', 0);
$img->src = $aimg->src;
// client scales based on aspect ratio in style attribute
$img->width = '';
$img->height = '';
}
// relink URIs, as the previous a-img tags weren't recognized by this function
$article = defaultLinkTo($article, $item['uri']);
// remove unwanted stuff
foreach ($article->find('figure.branding, a-ad, div.ho-text, noscript img, .opt-in__content-container') as $element) {
$element->remove();
}
// reload html, as remove() is buggy
$article = str_get_html($article->outertext);
$header = $article->find('header.a-article-header', 0);
$headerElements = $header->find('p, a-img img, figure img');
$item['content'] = implode('', $headerElements);
$authors = $header->find('.a-creator__names .a-creator__name');
if ($authors) {
$item['author'] = implode(', ', array_map(function ($e) {
return $e->plaintext;
}, $authors));
}
$content = $article->find('div[class*="article-content"]', 0);
$content = $article->find('.article-content', 0);
$contentElements = $content->find(
'p, h3, ul, table, pre, a-img img, a-bilderstrecke h2, a-bilderstrecke figure, a-bilderstrecke figcaption'
);
$item['content'] .= implode('', $contentElements);
if ($content == null) {
$content = $article->find('#article_content', 0);
}
foreach ($content->find('p, h3, ul, table, pre, img') as $element) {
$item['content'] .= $element;
}
foreach ($content->find('img') as $img) {
foreach ($article->find('a-img img, a-bilderstrecke img, figure img') as $img) {
$item['enclosures'][] = $img->src;
}