From 829d570f8e4f6b38f04b2fac4a3da387e64bf403 Mon Sep 17 00:00:00 2001
From: "Quentin B." <quent1-fr@users.noreply.github.com>
Date: Thu, 8 Aug 2024 00:57:40 +0200
Subject: [PATCH] [CentreFranceBridge] Add bridge (#4189)

* [CentreFranceBridge] Add bridge

* [CentreFranceBridge] Fix bridge

* [CentreFranceBridge] Fix bridge

* [CentreFranceBridge] Improved icon choice

* [CentreFranceBridge] Fetch additional data from articles

* [CentreFranceBridge] New parameter to allow client to control how many articles to fetch

* [CentreFranceBridge] Improve bridge name based on existing parameters

* [CentreFranceBridge] Fixed some edge cases

* refactor: reorder

* fix

---------

Co-authored-by: Dag <me@dvikan.no>
---
 bridges/CentreFranceBridge.php        | 279 ++++++++++++++++++++++++++
 lib/contents.php                      |   4 +-
 lib/simplehtmldom/simple_html_dom.php |   5 -
 3 files changed, 281 insertions(+), 7 deletions(-)
 create mode 100644 bridges/CentreFranceBridge.php

diff --git a/bridges/CentreFranceBridge.php b/bridges/CentreFranceBridge.php
new file mode 100644
index 00000000..a6dea227
--- /dev/null
+++ b/bridges/CentreFranceBridge.php
@@ -0,0 +1,279 @@
+<?php
+
+class CentreFranceBridge extends BridgeAbstract
+{
+    const NAME = 'Centre France Newspapers'; // Generic name, returned by getName() when no known newspaper is selected
+    const URI = 'https://www.centrefrance.com/'; // Group website; getIcon() falls back to its favicon
+    const DESCRIPTION = 'Common bridge for all Centre France group newspapers.';
+    const CACHE_TIMEOUT = 7200; // 2 hours
+    const MAINTAINER = 'quent1';
+    const PARAMETERS = [
+        'global' => [
+            'newspaper' => [
+                'name' => 'Newspaper',
+                'type' => 'list',
+                'values' => [ // Display name => domain from which the site URLs are built
+                    'La Montagne' => 'lamontagne.fr',
+                    'Le Populaire du Centre' => 'lepopulaire.fr',
+                    'La République du Centre' => 'larep.fr',
+                    'Le Berry Républicain' => 'leberry.fr',
+                    'L\'Yonne Républicaine' => 'lyonne.fr',
+                    'L\'Écho Républicain' => 'lechorepublicain.fr',
+                    'Le Journal du Centre' => 'lejdc.fr',
+                    'L\'Éveil de la Haute-Loire' => 'leveil.fr',
+                    'Le Pays' => 'le-pays.fr'
+                ]
+            ],
+            'remove-reserved-for-subscribers-articles' => [ // When checked, collectData() skips premium articles
+                'name' => 'Remove reserved for subscribers articles',
+                'type' => 'checkbox',
+                'title' => 'Filter out articles that are only available to subscribers'
+            ],
+            'limit' => [ // Maximum number of items; collectData() treats 0 as "no limit"
+                'name' => 'Limit',
+                'type' => 'number',
+                'title' => 'How many articles to fetch. 0 to disable.',
+                'required' => true,
+                'defaultValue' => 15 // Also the fallback when the submitted value is not a non-negative number
+            ]
+        ],
+        'Local news' => [
+            'locality-slug' => [ // Path segment appended to the newspaper URL and used to filter article links
+                'name' => 'Locality slug',
+                'type' => 'text',
+                'required' => false,
+                'title' => 'Fetch articles for a specific locality. If not set, headlines from the front page will be used instead.',
+                'exampleValue' => 'moulins-03000'
+            ],
+        ]
+    ];
+
+    /**
+     * Collects the articles listed on the selected newspaper's front page, or on
+     * one of its locality pages when a locality slug is given.
+     *
+     * Every article page is fetched as well to enrich its item. Items are appended
+     * to $this->items until the requested limit is reached (0 disables the limit).
+     */
+    public function collectData()
+    {
+        $newspaper = $this->getInput('newspaper');
+        if (empty($newspaper)) {
+            return;
+        }
+
+        // Normalised to an integer so that it can safely be compared with count() below
+        $limit = $this->getInput('limit');
+        if (!is_numeric($limit) || (int)$limit < 0) {
+            $limit = static::PARAMETERS['global']['limit']['defaultValue'];
+        }
+        $limit = (int)$limit;
+
+        // Without a locality the front page is used; avoid an empty path segment ("//")
+        $localitySlug = $this->getInput('locality-slug') ?? '';
+        $baseUrl = 'https://www.' . $newspaper . '/';
+        $html = getSimpleHTMLDOM($baseUrl . ($localitySlug === '' ? '' : $localitySlug . '/'));
+        $seenArticleURIs = [];
+
+        // Articles are detected through their titles
+        foreach ($html->find('.c-titre') as $titleBlock) {
+            if ($limit > 0 && count($this->items) >= $limit) {
+                break;
+            }
+
+            // A title block without a usable link cannot be turned into an item
+            $link = $titleBlock->find('a', 0);
+            $articleURI = $link !== null ? $link->href : false;
+            if (!is_string($articleURI) || $articleURI === '') {
+                continue;
+            }
+
+            // Ignore articles in the « Les + partagés » block
+            if (str_contains((string)$link->id, 'les_plus_partages')) {
+                continue;
+            }
+
+            // Skip duplicates and, when a locality is requested, articles of other localities
+            if (isset($seenArticleURIs[$articleURI])) {
+                continue;
+            }
+            if ($localitySlug !== '' && !str_contains($articleURI, $localitySlug)) {
+                continue;
+            }
+
+            // Articles reserved for subscribers are either dropped or flagged with a lock
+            $isPremium = $link->find('span.premium-picto', 0) !== null;
+            if ($isPremium && $this->getInput('remove-reserved-for-subscribers-articles') === true) {
+                continue;
+            }
+
+            $titleElement = $link->find('span[data-tb-title]', 0);
+            if ($titleElement === null) {
+                continue;
+            }
+
+            // array_merge() rather than "...": unpacking string-keyed arrays requires PHP 8.1+
+            $articleFullURI = urljoin($baseUrl, $articleURI);
+            $this->items[] = array_merge([
+                'title' => ($isPremium ? '🔒 ' : '') . $titleElement->innertext,
+                'uri' => $articleFullURI,
+            ], $this->collectArticleData($articleFullURI));
+            $seenArticleURIs[$articleURI] = true;
+        }
+    }
+
+    /**
+     * Fetches an article page and extracts the item fields missing from the listing page.
+     * @param string $uri Absolute URL of the article
+     * @return array 'enclosures', 'content' and 'categories', plus 'timestamp', 'author' and 'uid' when found
+     */
+    private function collectArticleData($uri): array
+    {
+        $html = getSimpleHTMLDOMCached($uri, 86400 * 90); // 90d
+
+        $item = [
+            'enclosures' => [],
+            'content' => '',
+        ];
+
+        $articleInformations = $html->find('.c-article-informations p');
+        if (is_array($articleInformations) && $articleInformations !== []) {
+            // Paragraphs: publication date, then an optional update date, then an optional author
+            $authorPosition = 1;
+            $timestamp = $this->parseArticleDate($articleInformations[0]->innertext);
+            if ($timestamp !== null) {
+                $item['timestamp'] = $timestamp;
+            }
+
+            // When present, the update date takes precedence over the publication date
+            if (count($articleInformations) >= 2) {
+                $timestamp = $this->parseArticleDate($articleInformations[1]->innertext);
+                if ($timestamp !== null) {
+                    $authorPosition = 2;
+                    $item['timestamp'] = $timestamp;
+                }
+            }
+
+            if (count($articleInformations) === ($authorPosition + 1)) {
+                $item['author'] = $articleInformations[$authorPosition]->innertext;
+            }
+        }
+
+        foreach ($html->find('.b-article .contenu > *') as $contentPart) {
+            // The audio player and the Poool widget are not part of the article body
+            if (in_array($contentPart->getAttribute('id'), ['cf-audio-player', 'poool-widget'], true)) {
+                continue;
+            }
+
+            foreach ($contentPart->find('.bloc, .p402_hide') as $articleHiddenPart) {
+                $contentPart->removeChild($articleHiddenPart);
+            }
+
+            $item['content'] .= $contentPart->innertext;
+        }
+
+        // An illustration or an audio track is exposed as enclosure only when it is unambiguous
+        foreach (['.photo-wrapper .photo-box img', '#cf-audio-player-container audio'] as $selector) {
+            $media = $html->find($selector);
+            if (count($media) === 1) {
+                $item['enclosures'][] = $media[0]->getAttribute('src');
+            }
+        }
+
+        $articleTags = $html->find('.b-article > ul.c-tags > li > a.t-simple');
+        $item['categories'] = array_map(static fn ($articleTag) => $articleTag->innertext, $articleTags);
+
+        // The identifier is what follows the last underscore of the URI, when it is numeric
+        $uriParts = explode('_', $uri);
+        $uid = rtrim(end($uriParts), '/');
+        if (is_numeric($uid)) {
+            $item['uid'] = $uid;
+        }
+
+        // If the article is a "grand format", we use another parsing strategy
+        if ($item['content'] === '' && $html->find('article') !== []) {
+            foreach ($html->find('article > section') as $contentPart) {
+                // find() returns an array unless an index is given: ask for the first match
+                $journalist = $contentPart->find('#journo', 0);
+                if ($journalist !== null) {
+                    $item['author'] = $journalist->innertext;
+                    continue;
+                }
+                $item['content'] .= $contentPart->innertext;
+            }
+        }
+
+        $item['content'] = str_replace('<span class="p-premium">premium</span>', '🔒', $item['content']);
+        $item['content'] = trim($item['content']);
+
+        return $item;
+    }
+
+    /**
+     * Parses a "dd/mm/yyyy" date, optionally followed by " à HHhMM", found in $text.
+     * @return int|null Unix timestamp (midnight when no time is given), null when no date is found
+     */
+    private function parseArticleDate($text): ?int
+    {
+        if (preg_match('/(\d{2})\/(\d{2})\/(\d{4})( à (\d{2})h(\d{2}))?/', $text, $parts) !== 1) {
+            return null;
+        }
+        $date = new \DateTime('midnight');
+        $date->setDate((int)$parts[3], (int)$parts[2], (int)$parts[1]);
+        // Groups 5 and 6 only exist when the optional time part matched
+        if (count($parts) === 7) {
+            $date->setTime((int)$parts[5], (int)$parts[6]);
+        }
+        return $date->getTimestamp();
+    }
+
+    /**
+     * Builds the feed name from the selected newspaper and, when one is set, from the
+     * locality slug stripped of its last dash-separated segment.
+     */
+    public function getName()
+    {
+        $domain = $this->getInput('newspaper');
+        $namesByDomain = array_flip(self::PARAMETERS['global']['newspaper']['values']);
+        if (empty($domain) || !isset($namesByDomain[$domain])) {
+            return static::NAME; // No known newspaper selected: generic bridge name
+        }
+
+        $feedName = $namesByDomain[$domain];
+        $slug = $this->getInput('locality-slug');
+        if (!empty($slug)) {
+            $slugParts = explode('-', $slug);
+            array_pop($slugParts);
+            $feedName .= ' ' . ucfirst(implode('-', $slugParts));
+        }
+        return $feedName;
+    }
+
+    /** Returns the favicon URL of the selected newspaper, or the group's one when none is selected. */
+    public function getIcon()
+    {
+        $newspaper = $this->getInput('newspaper');
+        // static::URI ends with a slash: trim it so the fallback URL does not contain "//favicon.ico"
+        $baseUrl = empty($newspaper) ? rtrim(static::URI, '/') : 'https://www.' . $newspaper;
+        return $baseUrl . '/favicon.ico';
+    }
+
+    /**
+     * Maps a newspaper URL (front page or locality page) to the parameters of this bridge.
+     * @param string $url URL to inspect
+     * @return array|null 'newspaper' and 'locality-slug' values, or null when the URL is not recognised
+     */
+    public function detectParameters($url)
+    {
+        // Group 3 captures the domain, group 5 the optional locality slug (name followed by five digits)
+        $pattern = '/^(https?:\/\/)?(www\.)?([a-z-]+\.fr)(\/)?([a-z-]+-[0-9]{5})?(\/)?$/';
+        if (preg_match($pattern, strtolower($url), $groups) === 0) {
+            return null;
+        }
+
+        $knownDomains = self::PARAMETERS['global']['newspaper']['values'];
+        $slug = empty($groups[5]) ? null : $groups[5];
+
+        return in_array($groups[3], $knownDomains, true) ? ['newspaper' => $groups[3], 'locality-slug' => $slug] : null;
+    }
+}
diff --git a/lib/contents.php b/lib/contents.php
index 893a3512..cc9542a9 100644
--- a/lib/contents.php
+++ b/lib/contents.php
@@ -142,7 +142,6 @@ function getContents(
  * when returning plaintext.
  * @param string $defaultSpanText Specifies the replacement text for `<span />`
  * tags when returning plaintext.
- * @return false|simple_html_dom Contents as simplehtmldom object.
  */
 function getSimpleHTMLDOM(
     $url,
@@ -154,11 +153,12 @@ function getSimpleHTMLDOM(
     $stripRN = true,
     $defaultBRText = DEFAULT_BR_TEXT,
     $defaultSpanText = DEFAULT_SPAN_TEXT
-) {
+): \simple_html_dom {
     $html = getContents($url, $header ?? [], $opts ?? []);
     if ($html === '') {
         throw new \Exception('Unable to parse dom because the http response was the empty string');
     }
+
     return str_get_html(
         $html,
         $lowercase,
diff --git a/lib/simplehtmldom/simple_html_dom.php b/lib/simplehtmldom/simple_html_dom.php
index 3fc95760..170f6fb0 100644
--- a/lib/simplehtmldom/simple_html_dom.php
+++ b/lib/simplehtmldom/simple_html_dom.php
@@ -118,11 +118,6 @@ function str_get_html(
         throw new \Exception('Refusing to parse too big input');
     }
 
-	if (empty($str) || strlen($str) > MAX_FILE_SIZE) {
-		$dom->clear();
-		return false;
-	}
-
 	return $dom->load($str, $lowercase, $stripRN);
 }