[ZeitBridge] Remove annoyances, add content

Remove navigational elements, podcast images.
Add many more header images, article content in <ul> (and, for good
measure, in <ol>), and quotes with their content and not only their
author.

Extreme example:
https://www.zeit.de/campus/2024-05/protest-palaestina-universitaet-europa-uebersicht
This commit is contained in:
Mynacol 2024-05-18 16:11:26 +02:00
parent a7ed3d56f9
commit 4d12aa2a9e

View file

@ -87,7 +87,7 @@ class ZeitBridge extends FeedExpander
// remove known bad elements
foreach (
$article->find(
'aside, .visually-hidden, .carousel-container, #tickaroo-liveblog, .zplus-badge, .article-heading__container--podcast, div[data-paywall], .js-embed-consent'
'aside, .visually-hidden, .carousel-container, #tickaroo-liveblog, .zplus-badge, .article-heading__container--podcast, .podcast-player__image, div[data-paywall], .js-embed-consent, script, nav, .article-flexible-toc__subheading-link, .faq-link'
) as $bad
) {
$bad->remove();
@ -114,7 +114,7 @@ class ZeitBridge extends FeedExpander
}
// header image
$headerimg = $article->find('*[data-ct-row="headerimage"]', 0) ?? $article->find('header', 0);
$headerimg = $article->find('*[data-ct-row="headerimage"]', 0) ?? $article->find('.article-header', 0) ?? $article->find('header', 0);
if ($headerimg) {
$item['content'] .= implode('', $headerimg->find('img[src], figcaption'));
}
@ -124,7 +124,7 @@ class ZeitBridge extends FeedExpander
if ($pages) {
foreach ($pages as $page) {
$elements = $page->find('p, h2, figcaption, img[src]');
$elements = $page->find('p, ul, ol, h2, figure.article__media img[src], figure.article__media figcaption, figure.quote');
$item['content'] .= implode('', $elements);
}
}