rss-bridge/bridges/ZeitBridge.php

<?php

/**
 * Bridge for zeit.de that expands the entries of a newsfeed.zeit.de feed
 * into full articles by fetching and scraping every linked article page.
 */
class ZeitBridge extends FeedExpander
{
    const MAINTAINER = 'Mynacol';
    const NAME = 'Zeit Online Bridge';
    const URI = 'https://www.zeit.de/';
    const CACHE_TIMEOUT = 1800; // 30min
    const DESCRIPTION = 'Returns the full articles instead of only the intro';
    const PARAMETERS = [[
        // Display name => feed URL; the selected URL is passed to collectData().
        'category' => [
            'name' => 'Category',
            'type' => 'list',
            'values' => [
                'Startseite'
                => 'https://newsfeed.zeit.de/index',
                'Politik'
                => 'https://newsfeed.zeit.de/politik/index',
                'Wirtschaft'
                => 'https://newsfeed.zeit.de/wirtschaft/index',
                'Gesellschaft'
                => 'https://newsfeed.zeit.de/gesellschaft/index',
                'Kultur'
                => 'https://newsfeed.zeit.de/kultur/index',
                'Wissen'
                => 'https://newsfeed.zeit.de/wissen/index',
                'Digital'
                => 'https://newsfeed.zeit.de/digital/index',
                'ZEIT Campus ONLINE'
                => 'https://newsfeed.zeit.de/campus/index',
                'ZEIT ONLINE Arbeit'
                => 'https://newsfeed.zeit.de/arbeit/index',
                'ZEIT Magazin ONLINE'
                => 'https://newsfeed.zeit.de/zeit-magazin/index',
                'Entdecken'
                => 'https://newsfeed.zeit.de/entdecken/index',
                'Mobilität'
                => 'https://newsfeed.zeit.de/mobilitaet/index',
                'Sport'
                => 'https://newsfeed.zeit.de/sport/index',
                'Alle Inhalte'
                => 'https://newsfeed.zeit.de/all'
            ]
        ],
        'limit' => [
            'name' => 'Limit',
            'type' => 'number',
            'required' => false,
            'title' => 'Specify number of full articles to return',
            'defaultValue' => 5
        ]
    ]];

    /**
     * Expands the feed selected via the "category" input.
     *
     * An empty or zero "limit" input falls back to 5 items.
     */
    public function collectData()
    {
        $url = $this->getInput('category');
        $limit = $this->getInput('limit') ?: 5;

        $this->collectExpandableDatas($url, $limit);
    }

    /**
     * Replaces the feed item's teaser with the scraped article.
     *
     * When the article page cannot be loaded into a DOM the item is returned
     * as received from the feed (apart from an emptied enclosure list), so one
     * broken article does not abort the whole feed.
     *
     * @param array $item Feed item; its 'uri' entry is the article URL.
     * @return array The item, enriched with content, author and enclosures.
     */
    protected function parseItem(array $item)
    {
        $item['enclosures'] = [];

        if (empty($item['uri'])) {
            // Nothing to fetch without an article URL.
            return $item;
        }

        $headers = [
            // Consent cookie carrying the current time. gmdate() is used so the
            // value really is UTC, as the literal "Z" suffix claims; the
            // millisecond part is always 000 for whole-second timestamps.
            'Cookie: zonconsent=' . gmdate('Y-m-d\TH:i:s.v\Z'),
        ];

        $article = getSimpleHTMLDOM($item['uri'], $headers);
        if (!$article) {
            return $item;
        }

        // Multi-page articles link to a one-page view; prefer it when present.
        $onePageUri = $item['uri'] . '/komplettansicht';
        if ($article->find('a[href="' . $onePageUri . '"]', 0)) {
            $onePage = getSimpleHTMLDOM($onePageUri, $headers);
            if ($onePage) {
                // Only switch once the one-page view was actually loaded,
                // otherwise keep working with the first page.
                $item['uri'] = $onePageUri;
                $article = $onePage;
            }
        }

        $article = defaultLinkTo($article, $item['uri']);

        return $this->parseArticle($item, $article);
    }

    /**
     * Extracts enclosures, authors and the article text from an article DOM.
     *
     * Extracted HTML is appended to whatever content the item already has.
     *
     * @param array $item    Feed item to enrich.
     * @param mixed $article Parsed article page (simple_html_dom document).
     * @return array The enriched item; unchanged when the page has no <main>
     *               element or the cleaned markup cannot be re-parsed.
     */
    private function parseArticle($item, $article)
    {
        $main = $article->find('main', 0);
        if (!$main) {
            // Unexpected page layout: keep what the feed delivered.
            return $item;
        }

        // remove known bad elements
        foreach (
            $main->find(
                'aside, .visually-hidden, .carousel-container, #tickaroo-liveblog, .zplus-badge, .article-heading__container--podcast, div[data-paywall], .js-embed-consent'
            ) as $bad
        ) {
            $bad->remove();
        }
        // reload html, as remove() is buggy
        $article = str_get_html($main->outertext);
        if (!$article) {
            return $item;
        }

        // podcast audio, if available
        $podcast = $article->find('.article-heading__podcast audio[src]', 0);
        if ($podcast) {
            $item['enclosures'][] = $podcast->src;
        }

        // full res images: promote the lazy-loading source to the real one
        foreach ($article->find('img[data-src]') as $img) {
            $img->src = $img->getAttribute('data-src');
            $item['enclosures'][] = $img->src;
        }

        // authors, falling back to the source line when no person is marked up
        $authors = $article->find('*[itemtype*="schema.org/Person"]');
        if (!$authors) {
            $authors = $article->find('.metadata__source');
        }
        if ($authors) {
            // The nodes are joined via their string conversion.
            $item['author'] = implode(', ', $authors);
        }

        // The feed item may arrive without a content entry; start from an
        // empty string so the appends below cannot hit an undefined index.
        $content = $item['content'] ?? '';

        // header image
        $headerimg = $article->find('*[data-ct-row="headerimage"]', 0) ?? $article->find('header', 0);
        if ($headerimg) {
            $content .= implode('', $headerimg->find('img[src], figcaption'));
        }

        // article content
        foreach ($article->find('.article-page') as $page) {
            $content .= implode('', $page->find('p, h2, figcaption, img[src]'));
        }

        $item['content'] = $content;

        return $item;
    }
}
[ZeitBridge] Add bridge for zeit.de (#3056) * [ZeitBridge] Add bridge for zeit.de New bridge expanding the feeds of zeit.de to full-text ones. Circumvents cookie banners and Z+ premium article paywalls. * [ZeitBridge] Formatting 2022-09-21 23:24:11 +03:00			`<?php`

			`class ZeitBridge extends FeedExpander`
			`{`
			`const MAINTAINER = 'Mynacol';`
			`const NAME = 'Zeit Online Bridge';`
			`const URI = 'https://www.zeit.de/';`
			`const CACHE_TIMEOUT = 1800; // 30min`
			`const DESCRIPTION = 'Returns the full articles instead of only the intro';`
			`const PARAMETERS = [[`
			`'category' => [`
			`'name' => 'Category',`
			`'type' => 'list',`
			`'values' => [`
			`'Startseite'`
			`=> 'https://newsfeed.zeit.de/index',`
			`'Politik'`
			`=> 'https://newsfeed.zeit.de/politik/index',`
			`'Wirtschaft'`
			`=> 'https://newsfeed.zeit.de/wirtschaft/index',`
			`'Gesellschaft'`
			`=> 'https://newsfeed.zeit.de/gesellschaft/index',`
			`'Kultur'`
			`=> 'https://newsfeed.zeit.de/kultur/index',`
			`'Wissen'`
			`=> 'https://newsfeed.zeit.de/wissen/index',`
			`'Digital'`
			`=> 'https://newsfeed.zeit.de/digital/index',`
			`'ZEIT Campus ONLINE'`
			`=> 'https://newsfeed.zeit.de/campus/index',`
			`'ZEIT ONLINE Arbeit'`
			`=> 'https://newsfeed.zeit.de/arbeit/index',`
			`'ZEIT Magazin ONLINE'`
			`=> 'https://newsfeed.zeit.de/zeit-magazin/index',`
			`'Entdecken'`
			`=> 'https://newsfeed.zeit.de/entdecken/index',`
			`'Mobilität'`
			`=> 'https://newsfeed.zeit.de/mobilitaet/index',`
			`'Sport'`
			`=> 'https://newsfeed.zeit.de/sport/index',`
			`'Alle Inhalte'`
			`=> 'https://newsfeed.zeit.de/all'`
			`]`
			`],`
			`'limit' => [`
			`'name' => 'Limit',`
			`'type' => 'number',`
			`'required' => false,`
			`'title' => 'Specify number of full articles to return',`
			`'defaultValue' => 5`
			`]`
			`]];`

			`public function collectData()`
			`{`
refactor: FeedExpander::parseItem() descendants (#3744) 2023-10-13 01:25:34 +03:00			`$url = $this->getInput('category');`
			`$limit = $this->getInput('limit') ?: 5;`

			`$this->collectExpandableDatas($url, $limit);`
[ZeitBridge] Add bridge for zeit.de (#3056) * [ZeitBridge] Add bridge for zeit.de New bridge expanding the feeds of zeit.de to full-text ones. Circumvents cookie banners and Z+ premium article paywalls. * [ZeitBridge] Formatting 2022-09-21 23:24:11 +03:00			`}`

refactor: remove parent calls to parseItem (#3747) 2023-10-13 02:59:05 +03:00			`protected function parseItem(array $item)`
[ZeitBridge] Add bridge for zeit.de (#3056) * [ZeitBridge] Add bridge for zeit.de New bridge expanding the feeds of zeit.de to full-text ones. Circumvents cookie banners and Z+ premium article paywalls. * [ZeitBridge] Formatting 2022-09-21 23:24:11 +03:00			`{`
			`$item['enclosures'] = [];`

			`$headers = [`
			`'Cookie: zonconsent=' . date('Y-m-d\TH:i:s.v\Z'),`
[ZeitBridge] Revert User-Agent (#3350) The Googlebot User-Agent is no longer sufficient to circumvent the paywall. 2023-04-17 16:33:14 +03:00			`];`
[ZeitBridge] Add bridge for zeit.de (#3056) * [ZeitBridge] Add bridge for zeit.de New bridge expanding the feeds of zeit.de to full-text ones. Circumvents cookie banners and Z+ premium article paywalls. * [ZeitBridge] Formatting 2022-09-21 23:24:11 +03:00
			`// one-page article`
			`$article = getSimpleHTMLDOM($item['uri'], $headers);`
			`if ($article->find('a[href="' . $item['uri'] . '/komplettansicht"]', 0)) {`
			`$item['uri'] .= '/komplettansicht';`
			`$article = getSimpleHTMLDOM($item['uri'], $headers);`
			`}`

			`$article = defaultLinkTo($article, $item['uri']);`
			`$item = $this->parseArticle($item, $article);`

			`return $item;`
			`}`

			`private function parseArticle($item, $article)`
			`{`
			`$article = $article->find('main', 0);`

			`// remove known bad elements`
			`foreach (`
			`$article->find(`
[ZeitBridge] Remove doubled text The first two paragraphs were repeated at the end of articles. The first CSS selector filters those out (example 1). The second CSS selector removes a "Zum Anschauen benötigen wir Ihre Zustimmung" line from a poll widget. We can't load the widget successfully, therefore we should remove all embeds that seem to use javascript (example 2). 1: https://www.zeit.de/campus/2024-03/bundesregierung-wissenschaft-arbeitsvertrag-regeln 2: https://www.zeit.de/campus/2024-03/ausbildung-abgebrochen-gruende-azubi-aufruf 2024-03-11 00:21:10 +03:00			`'aside, .visually-hidden, .carousel-container, #tickaroo-liveblog, .zplus-badge, .article-heading__container--podcast, div[data-paywall], .js-embed-consent'`
[ZeitBridge] Add bridge for zeit.de (#3056) * [ZeitBridge] Add bridge for zeit.de New bridge expanding the feeds of zeit.de to full-text ones. Circumvents cookie banners and Z+ premium article paywalls. * [ZeitBridge] Formatting 2022-09-21 23:24:11 +03:00			`) as $bad`
			`) {`
			`$bad->remove();`
			`}`
			`// reload html, as remove() is buggy`
			`$article = str_get_html($article->outertext);`

			`// podcast audio, if available`
			`$podcast_src = $article->find('.article-heading__podcast audio[src]', 0);`
			`if ($podcast_src) {`
			`$item['enclosures'][] = $podcast_src->src;`
			`}`

			`// full res images`
			`foreach ($article->find('img[data-src]') as $img) {`
			`$img->src = $img->getAttribute('data-src');`
			`$item['enclosures'][] = $img->src;`
			`}`

			`// authors`
			`$authors = $article->find('*[itemtype*="schema.org/Person"]');`
			`if (!$authors) {`
			`$authors = $article->find('.metadata__source');`
			`}`
			`if ($authors) {`
			`$item['author'] = implode(', ', $authors);`
			`}`

			`// header image`
			`$headerimg = $article->find('*[data-ct-row="headerimage"]', 0) ?? $article->find('header', 0);`
			`if ($headerimg) {`
			`$item['content'] .= implode('', $headerimg->find('img[src], figcaption'));`
			`}`

			`// article content`
			`$pages = $article->find('.article-page');`

			`if ($pages) {`
			`foreach ($pages as $page) {`
			`$elements = $page->find('p, h2, figcaption, img[src]');`
			`$item['content'] .= implode('', $elements);`
			`}`
			`}`

			`return $item;`
			`}`
			`}`