rss-bridge/bridges/DeveloppezDotComBridge.php

<?php

/**
 * Bridge for Developpez.com news feeds.
 *
 * Reads the RSS feed of the selected Developpez.com sub-domain and, for each
 * item, downloads the linked page to replace the short RSS abstract with the
 * full article body, attach video URLs as enclosures and set the author.
 */
class DeveloppezDotComBridge extends FeedExpander
{
    const MAINTAINER = 'Binnette';
    const NAME = 'Developpez.com Actus (FR)';
    const URI = 'https://www.developpez.com/';
    // Suffix appended to the selected sub-domain to build the feed host
    const DOMAIN = '.developpez.com/';
    // Path of the RSS feed, relative to the (sub-)domain root
    const RSS_URL = 'index/rss';
    const CACHE_TIMEOUT = 1800; // 30min
    const DESCRIPTION = 'Returns complete posts from developpez.com';
    // Encodings used by Developpez.com in their article bodies. They are tried
    // in this order by decodeParagraph(), which keeps the LAST one that
    // validates, so UTF-8 takes precedence over Windows-1252.
    const ENCONDINGS = ['Windows-1252', 'UTF-8'];
    const PARAMETERS = [
        [
            'limit' => [
                'name' => 'Max items',
                'type' => 'number',
                'defaultValue' => 5,
            ],
            // List of the available RSS feeds (one per sub-domain)
            'domain' => [
                'type' => 'list',
                'name' => 'Domaine',
                'title' => 'Choisissez un sous-domaine',
                'values' => [
                    '= Domaine principal =' => 'www',
                    '4d' => '4d',
                    'abbyy' => 'abbyy',
                    'access' => 'access',
                    'agile' => 'agile',
                    'ajax' => 'ajax',
                    'algo' => 'algo',
                    'alm' => 'alm',
                    'android' => 'android',
                    'apache' => 'apache',
                    'applications' => 'applications',
                    'arduino' => 'arduino',
                    'asm' => 'asm',
                    'asp' => 'asp',
                    'aspose' => 'aspose',
                    'bacasable' => 'bacasable',
                    'big-data' => 'big-data',
                    'bpm' => 'bpm',
                    'bsd' => 'bsd',
                    'business-intelligence' => 'business-intelligence',
                    'c' => 'c',
                    'cloud-computing' => 'cloud-computing',
                    'club' => 'club',
                    'cms' => 'cms',
                    'cpp' => 'cpp',
                    'crm' => 'crm',
                    'css' => 'css',
                    'd' => 'd',
                    'dart' => 'dart',
                    'data-science' => 'data-science',
                    'db2' => 'db2',
                    'delphi' => 'delphi',
                    'dotnet' => 'dotnet',
                    'droit' => 'droit',
                    'eclipse' => 'eclipse',
                    'edi' => 'edi',
                    'embarque' => 'embarque',
                    'emploi' => 'emploi',
                    'etudes' => 'etudes',
                    'excel' => 'excel',
                    'firebird' => 'firebird',
                    'flash' => 'flash',
                    'go' => 'go',
                    'green-it' => 'green-it',
                    'gtk' => 'gtk',
                    'hardware' => 'hardware',
                    'hpc' => 'hpc',
                    'humour' => 'humour',
                    'ibmcloud' => 'ibmcloud',
                    'intelligence-artificielle' => 'intelligence-artificielle',
                    'interbase' => 'interbase',
                    'ios' => 'ios',
                    'java' => 'java',
                    'javascript' => 'javascript',
                    'javaweb' => 'javaweb',
                    'jetbrains' => 'jetbrains',
                    'jeux' => 'jeux',
                    'kotlin' => 'kotlin',
                    'labview' => 'labview',
                    'laravel' => 'laravel',
                    'latex' => 'latex',
                    'lazarus' => 'lazarus',
                    'linux' => 'linux',
                    'mac' => 'mac',
                    'matlab' => 'matlab',
                    'megaoffice' => 'megaoffice',
                    'merise' => 'merise',
                    'microsoft' => 'microsoft',
                    'mobiles' => 'mobiles',
                    'mongodb' => 'mongodb',
                    'mysql' => 'mysql',
                    'netbeans' => 'netbeans',
                    'nodejs' => 'nodejs',
                    'nosql' => 'nosql',
                    'objective-c' => 'objective-c',
                    'office' => 'office',
                    'open-source' => 'open-source',
                    'openoffice-libreoffice' => 'openoffice-libreoffice',
                    'oracle' => 'oracle',
                    'outlook' => 'outlook',
                    'pascal' => 'pascal',
                    'perl' => 'perl',
                    'php' => 'php',
                    'portail-emploi' => 'portail-emploi',
                    'portail-projets' => 'portail-projets',
                    'postgresql' => 'postgresql',
                    'powerpoint' => 'powerpoint',
                    'preprod-emploi' => 'preprod-emploi',
                    'programmation' => 'programmation',
                    'project' => 'project',
                    'purebasic' => 'purebasic',
                    'pyqt' => 'pyqt',
                    'python' => 'python',
                    'qt-creator' => 'qt-creator',
                    'qt' => 'qt',
                    'r' => 'r',
                    'raspberry-pi' => 'raspberry-pi',
                    'reseau' => 'reseau',
                    'ruby' => 'ruby',
                    'rust' => 'rust',
                    'sap' => 'sap',
                    'sas' => 'sas',
                    'scilab' => 'scilab',
                    'securite' => 'securite',
                    'sgbd' => 'sgbd',
                    'sharepoint' => 'sharepoint',
                    'solutions-entreprise' => 'solutions-entreprise',
                    'spring' => 'spring',
                    'sqlserver' => 'sqlserver',
                    'stages' => 'stages',
                    'supervision' => 'supervision',
                    'swift' => 'swift',
                    'sybase' => 'sybase',
                    'symfony' => 'symfony',
                    'systeme' => 'systeme',
                    'talend' => 'talend',
                    'typescript' => 'typescript',
                    'uml' => 'uml',
                    'unix' => 'unix',
                    'vb' => 'vb',
                    'vba' => 'vba',
                    'virtualisation' => 'virtualisation',
                    'visualstudio' => 'visualstudio',
                    'web-semantique' => 'web-semantique',
                    'web' => 'web',
                    'webmarketing' => 'webmarketing',
                    'wind' => 'wind',
                    'windows-azure' => 'windows-azure',
                    'windows' => 'windows',
                    'windowsphone' => 'windowsphone',
                    'word' => 'word',
                    'xhtml' => 'xhtml',
                    'xml' => 'xml',
                    'zend-framework' => 'zend-framework',
                ],
            ]
        ]
    ];

    /**
     * Grabs the RSS items from Developpez.com.
     *
     * Builds the feed URL for the selected sub-domain and hands it to
     * collectExpandableDatas() together with the value 20.
     * NOTE(review): 20 is presumably the maximum number of feed items to
     * expand - confirm against FeedExpander.
     */
    public function collectData()
    {
        $url = $this->getRssUrl();
        $this->collectExpandableDatas($url, 20);
    }

    /**
     * Parses one RSS item and tries to replace its abstract with the full
     * article found behind the item URL.
     *
     * @param array $item Parsed feed item; reads 'title', 'uri' and
     *                    'enclosures', writes 'title', 'content',
     *                    'enclosures' and 'author'.
     * @return array|null The enriched item, or null once the number of
     *                    collected items has reached the 'limit' input.
     */
    protected function parseItem(array $item)
    {
        // Stop expanding items as soon as the requested amount is reached.
        // NOTE(review): relies on $this->items growing between calls, which is
        // presumably done by FeedExpander - confirm.
        if (count($this->items) >= $this->getInput('limit')) {
            return null;
        }

        // There is a bug in the Developpez RSS: commas are written as '~?' in
        // the title, so they have to be fixed manually
        if (isset($item['title'])) {
            $item['title'] = $this->fixComaInTitle($item['title']);
        }

        // We get the content of the full article behind the RSS item URL
        $articleHTMLContent = getSimpleHTMLDOMCached($item['uri']);
        if (!$articleHTMLContent) {
            // NOTE(review): the helper presumably returns false when the page
            // cannot be parsed - confirm. In that case keep the RSS abstract
            // instead of failing on a method call on a non-object.
            return $item;
        }

        // Here we call our custom parser
        $fullText = $this->extractFullText($articleHTMLContent);
        if (!is_null($fullText)) {
            // If we managed to parse the page behind the URL of the RSS item
            // we set it as the new content. Otherwise we keep the default
            // content so that RSS-Bridge does not return an empty item
            $item['content'] = $fullText;
        }

        // Now we attach the video URLs to the item
        $videosUrl = $this->getAllVideoUrl($articleHTMLContent);
        if (!empty($videosUrl)) {
            // 'enclosures' may be absent from the item: default to an empty
            // list so that array_merge() never receives null
            $item['enclosures'] = array_merge((array) ($item['enclosures'] ?? []), $videosUrl);
        }

        // Now we can look for the blog writer/creator
        $author = $articleHTMLContent->find('[itemprop="creator"]', 0);
        if (!empty($author)) {
            $item['author'] = $author->outertext;
        }

        return $item;
    }

    /**
     * Returns the RSS URL for the selected domain.
     *
     * @return string 'https://<domain>.developpez.com/index/rss' when a
     *                'domain' input is set, the main site feed otherwise.
     */
    private function getRssUrl()
    {
        $domain = $this->getInput('domain');
        if (!empty($domain)) {
            return 'https://' . $domain . self::DOMAIN . self::RSS_URL;
        }

        return self::URI . self::RSS_URL;
    }

    /**
     * Replaces every '~?' by a proper comma ','.
     *
     * @param string $txt Title as found in the feed.
     * @return string Title with the commas restored.
     */
    private function fixComaInTitle($txt)
    {
        return str_replace('~?', ',', $txt);
    }

    /**
     * Returns the full article contained in the page pointed to by the RSS
     * item. Since Developpez.com only provides a short abstract of the
     * article, the page is used to build the complete content.
     *
     * @param object $articleHTMLContent Parsed article page (Simple HTML DOM).
     * @return string|null Decoded article HTML, or null when the page has no
     *                     'div.content' element.
     */
    private function extractFullText($articleHTMLContent)
    {
        // Every blog entry contains a div with the class 'content' holding the
        // complete article. But the RSS can also return announcements that are
        // not blog articles; the check below takes care of those entries
        $divArticleEntry = $articleHTMLContent->find('div.content', 0);
        if (is_null($divArticleEntry)) {
            // No div with class 'content': probably not a blog entry but an
            // announcement for an ebook, a PDF, etc. The caller then keeps the
            // default RSS item content.
            return null;
        }

        // The following code is a bit hacky. According to the original author,
        // Developpez.com encodes some paragraphs of an article as UTF-8 and
        // others as Windows-1252, so the article can NOT be decoded with a
        // single encoding: every paragraph is checked and decoded on its own.

        // All the child nodes ('paragraphs') of the article: pictures, text
        // and the links at the bottom of the article
        $paragraphs = $divArticleEntry->nodes;
        // This will store the complete decoded content
        $fullText = '';

        foreach ($paragraphs as $paragraph) {
            // A new DOM document is created from the current node so that the
            // lookups done by decodeParagraph() are limited to this paragraph
            // (they search for a 'div.video-container' inside the document).
            $html = str_get_html($paragraph->outertext);
            if (!$html) {
                // The node could not be parsed on its own (str_get_html()
                // gave a falsy result): keep its raw text rather than calling
                // a method on a non-object
                $fullText .= $this->formatParagraph($paragraph->outertext);
                continue;
            }
            $fullText .= $this->decodeParagraph($html);
        }

        // Finally we return the decoded content of the article
        return $fullText;
    }

    /**
     * Converts one paragraph of the article to UTF-8 HTML.
     *
     * A paragraph holding a video container is replaced by a link to the
     * video. Any other paragraph is converted to UTF-8 from the last encoding
     * of self::ENCONDINGS that validates it; when no conversion succeeds the
     * paragraph is returned undecoded.
     *
     * @param object $p Stand-alone DOM document of the paragraph.
     * @return string HTML fragment to append to the article content.
     */
    private function decodeParagraph($p)
    {
        // First we check if this paragraph is a video
        $videoUrl = $this->getVideoUrl($p);
        if (!empty($videoUrl)) {
            // The URL comes from the remote page: escape it before putting it
            // in an attribute. Double encoding is disabled so that entities
            // already present in the URL (e.g. '&amp;') are left untouched.
            $safeUrl = htmlspecialchars($videoUrl, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8', false);
            // If this is a video, we just return a link to the video
            // &#128250; is the television emoji (U+1F4FA)
            return "<p>\n\t\t\t\t\t\t<b>&#128250; <a href=\"" . $safeUrl
                . "\">Voir la vidéo</a></b>\n\t\t\t\t\t</p>";
        }

        // We take outertext to get the complete paragraph and not only the
        // text inside it. That way we still grab <img> blocks and so on.
        $pTxt = $p->outertext;
        // This will store the decoded text if we manage to decode it
        $decodedTxt = '';

        // Every encoding is tried in turn. There is deliberately no break: a
        // later match overwrites an earlier one, so the last valid encoding of
        // the list wins.
        foreach (self::ENCONDINGS as $enc) {
            if (mb_check_encoding($pTxt, $enc)) {
                $decodedTxt = iconv($enc, 'UTF-8', $pTxt);
            }
        }

        // The strings must not be trimmed, otherwise an <a> would be glued to
        // the text, like: the software<a href="...">started</a>to...
        if (!empty($decodedTxt)) {
            // We managed to decode the text, so we take the decoded version
            return $this->formatParagraph($decodedTxt);
        }

        // Otherwise we take the non decoded version and hope it will not be
        // displayed too badly in the full text content
        return $this->formatParagraph($pTxt);
    }

    /**
     * Returns true if $txt contains at least one HTML element, false if it is
     * plain text (or empty, or cannot be parsed).
     *
     * @param string $txt Fragment to inspect.
     * @return bool
     */
    private function isHtmlTagNotTxt($txt)
    {
        if ($txt === '') {
            return false;
        }
        $html = str_get_html($txt);
        return $html && $html->root && count($html->root->children) > 0;
    }

    /**
     * Adds a space before the paragraph when needed, so that it is not glued
     * to the end of the previous one.
     *
     * @param string $txt Paragraph HTML or text.
     * @return string $txt, prefixed with a space when it contains an HTML
     *                element or starts with a letter, digit or underscore.
     */
    private function formatParagraph($txt)
    {
        // If the paragraph is an HTML tag and not plain text, we add a space
        if ($this->isHtmlTagNotTxt($txt)) {
            return ' ' . $txt;
        }

        // If the text starts with a word character (not punctuation), we add
        // a space. The check is Unicode aware so that words starting with an
        // accented letter ("Également", "À", ...) are recognised as well.
        $startsWithWord = preg_match('/^[\p{L}\p{N}_]/u', $txt);
        if ($startsWithWord === false) {
            // $txt is not valid UTF-8 (undecoded paragraph): fall back to the
            // byte-wise check
            $startsWithWord = preg_match('/^\w/', $txt);
        }
        if ($startsWithWord) {
            return ' ' . $txt;
        }

        return $txt;
    }

    /**
     * Retrieves all the video URLs of the article.
     *
     * @param object $item Parsed article page (Simple HTML DOM).
     * @return array List of video URLs; containers without a usable iframe
     *               are skipped.
     */
    private function getAllVideoUrl($item)
    {
        // Array of video URLs
        $url = [];

        // Developpez uses a div with the class video-container
        $divsVideo = $item->find('div.video-container');
        if (empty($divsVideo)) {
            return $url;
        }

        foreach ($divsVideo as $div) {
            // getVideoUrl() searches for the container INSIDE the document it
            // receives, so each div is parsed again as its own document
            $html = str_get_html($div->outertext);
            if (!$html) {
                continue;
            }
            $videoUrl = $this->getVideoUrl($html);
            // getVideoUrl() returns null when there is no iframe or no src:
            // such entries must not end up in the list
            if ($videoUrl !== null) {
                $url[] = $videoUrl;
            }
        }

        return $url;
    }

    /**
     * Retrieves the URL of the video of a paragraph, read from the src of the
     * iframe found in its 'div.video-container'.
     * Works for Youtube. Other video platforms still have to be tested.
     *
     * @param object $p DOM document to search in.
     * @return string|null The video URL (protocol-relative URLs are prefixed
     *                     with 'https:'), or null when there is no container,
     *                     no iframe or no src.
     */
    private function getVideoUrl($p)
    {
        $divVideo = $p->find('div.video-container', 0);
        if (empty($divVideo)) {
            return null;
        }
        $iframe = $divVideo->find('iframe', 0);
        if (empty($iframe)) {
            return null;
        }
        $src = trim($iframe->getAttribute('src'));
        if (empty($src)) {
            return null;
        }
        if (str_starts_with($src, '//')) {
            $src = 'https:' . $src;
        }
        return $src;
    }
}
Developpez.com (FR) Bridge 2014-07-14 21:41:09 +04:00			`<?php`

Full rewrite of bridge DeveloppezDotCom (#2689) 2022-05-08 04:38:33 +03:00			`class DeveloppezDotComBridge extends FeedExpander`
			`{`
Reformat codebase v4 (#2872) Reformat code base to PSR12 Co-authored-by: rssbridge <noreply@github.com> 2022-07-01 16:10:30 +03:00			`const MAINTAINER = 'Binnette';`
			`const NAME = 'Developpez.com Actus (FR)';`
			`const URI = 'https://www.developpez.com/';`
			`const DOMAIN = '.developpez.com/';`
			`const RSS_URL = 'index/rss';`
			`const CACHE_TIMEOUT = 1800; // 30min`
			`const DESCRIPTION = 'Returns complete posts from developpez.com';`
			`// Encodings used by Developpez.com in their articles body`
			`const ENCONDINGS = ['Windows-1252', 'UTF-8'];`
			`const PARAMETERS = [`
			`[`
			`'limit' => [`
			`'name' => 'Max items',`
			`'type' => 'number',`
			`'defaultValue' => 5,`
			`],`
			`// list of the differents RSS availables`
			`'domain' => [`
			`'type' => 'list',`
			`'name' => 'Domaine',`
			`'title' => 'Chosissez un sous-domaine',`
			`'values' => [`
			`'= Domaine principal =' => 'www',`
			`'4d' => '4d',`
			`'abbyy' => 'abbyy',`
			`'access' => 'access',`
			`'agile' => 'agile',`
			`'ajax' => 'ajax',`
			`'algo' => 'algo',`
			`'alm' => 'alm',`
			`'android' => 'android',`
			`'apache' => 'apache',`
			`'applications' => 'applications',`
			`'arduino' => 'arduino',`
			`'asm' => 'asm',`
			`'asp' => 'asp',`
			`'aspose' => 'aspose',`
			`'bacasable' => 'bacasable',`
			`'big-data' => 'big-data',`
			`'bpm' => 'bpm',`
			`'bsd' => 'bsd',`
			`'business-intelligence' => 'business-intelligence',`
			`'c' => 'c',`
			`'cloud-computing' => 'cloud-computing',`
			`'club' => 'club',`
			`'cms' => 'cms',`
			`'cpp' => 'cpp',`
			`'crm' => 'crm',`
			`'css' => 'css',`
			`'d' => 'd',`
			`'dart' => 'dart',`
			`'data-science' => 'data-science',`
			`'db2' => 'db2',`
			`'delphi' => 'delphi',`
			`'dotnet' => 'dotnet',`
			`'droit' => 'droit',`
			`'eclipse' => 'eclipse',`
			`'edi' => 'edi',`
			`'embarque' => 'embarque',`
			`'emploi' => 'emploi',`
			`'etudes' => 'etudes',`
			`'excel' => 'excel',`
			`'firebird' => 'firebird',`
			`'flash' => 'flash',`
			`'go' => 'go',`
			`'green-it' => 'green-it',`
			`'gtk' => 'gtk',`
			`'hardware' => 'hardware',`
			`'hpc' => 'hpc',`
			`'humour' => 'humour',`
			`'ibmcloud' => 'ibmcloud',`
			`'intelligence-artificielle' => 'intelligence-artificielle',`
			`'interbase' => 'interbase',`
			`'ios' => 'ios',`
			`'java' => 'java',`
			`'javascript' => 'javascript',`
			`'javaweb' => 'javaweb',`
			`'jetbrains' => 'jetbrains',`
			`'jeux' => 'jeux',`
			`'kotlin' => 'kotlin',`
			`'labview' => 'labview',`
			`'laravel' => 'laravel',`
			`'latex' => 'latex',`
			`'lazarus' => 'lazarus',`
			`'linux' => 'linux',`
			`'mac' => 'mac',`
			`'matlab' => 'matlab',`
			`'megaoffice' => 'megaoffice',`
			`'merise' => 'merise',`
			`'microsoft' => 'microsoft',`
			`'mobiles' => 'mobiles',`
			`'mongodb' => 'mongodb',`
			`'mysql' => 'mysql',`
			`'netbeans' => 'netbeans',`
			`'nodejs' => 'nodejs',`
			`'nosql' => 'nosql',`
			`'objective-c' => 'objective-c',`
			`'office' => 'office',`
			`'open-source' => 'open-source',`
			`'openoffice-libreoffice' => 'openoffice-libreoffice',`
			`'oracle' => 'oracle',`
			`'outlook' => 'outlook',`
			`'pascal' => 'pascal',`
			`'perl' => 'perl',`
			`'php' => 'php',`
			`'portail-emploi' => 'portail-emploi',`
			`'portail-projets' => 'portail-projets',`
			`'postgresql' => 'postgresql',`
			`'powerpoint' => 'powerpoint',`
			`'preprod-emploi' => 'preprod-emploi',`
			`'programmation' => 'programmation',`
			`'project' => 'project',`
			`'purebasic' => 'purebasic',`
			`'pyqt' => 'pyqt',`
			`'python' => 'python',`
			`'qt-creator' => 'qt-creator',`
			`'qt' => 'qt',`
			`'r' => 'r',`
			`'raspberry-pi' => 'raspberry-pi',`
			`'reseau' => 'reseau',`
			`'ruby' => 'ruby',`
			`'rust' => 'rust',`
			`'sap' => 'sap',`
			`'sas' => 'sas',`
			`'scilab' => 'scilab',`
			`'securite' => 'securite',`
			`'sgbd' => 'sgbd',`
			`'sharepoint' => 'sharepoint',`
			`'solutions-entreprise' => 'solutions-entreprise',`
			`'spring' => 'spring',`
			`'sqlserver' => 'sqlserver',`
			`'stages' => 'stages',`
			`'supervision' => 'supervision',`
			`'swift' => 'swift',`
			`'sybase' => 'sybase',`
			`'symfony' => 'symfony',`
			`'systeme' => 'systeme',`
			`'talend' => 'talend',`
			`'typescript' => 'typescript',`
			`'uml' => 'uml',`
			`'unix' => 'unix',`
			`'vb' => 'vb',`
			`'vba' => 'vba',`
			`'virtualisation' => 'virtualisation',`
			`'visualstudio' => 'visualstudio',`
			`'web-semantique' => 'web-semantique',`
			`'web' => 'web',`
			`'webmarketing' => 'webmarketing',`
			`'wind' => 'wind',`
			`'windows-azure' => 'windows-azure',`
			`'windows' => 'windows',`
			`'windowsphone' => 'windowsphone',`
			`'word' => 'word',`
			`'xhtml' => 'xhtml',`
			`'xml' => 'xml',`
			`'zend-framework' => 'zend-framework'`
			`],`
			`]`
			`]`
			`];`
Full rewrite of bridge DeveloppezDotCom (#2689) 2022-05-08 04:38:33 +03:00
Reformat codebase v4 (#2872) Reformat code base to PSR12 Co-authored-by: rssbridge <noreply@github.com> 2022-07-01 16:10:30 +03:00			`/**`
			`* Grabs the RSS item from Developpez.com`
			`*/`
			`public function collectData()`
			`{`
			`$url = $this->getRssUrl();`
			`$this->collectExpandableDatas($url, 20);`
			`}`
Premier lot de bridges utilisant le premier système. Signed-off-by: teromene <teromene@teromene.fr> 2015-11-04 01:28:44 +03:00
Reformat codebase v4 (#2872) Reformat code base to PSR12 Co-authored-by: rssbridge <noreply@github.com> 2022-07-01 16:10:30 +03:00			`/**`
			`* Parse the content of every RSS item. And will try to get the full article`
			`* pointed by the item URL intead of the default abstract.`
			`*/`
refactor: remove parent calls to parseItem (#3747) 2023-10-13 02:59:05 +03:00			`protected function parseItem(array $item)`
Reformat codebase v4 (#2872) Reformat code base to PSR12 Co-authored-by: rssbridge <noreply@github.com> 2022-07-01 16:10:30 +03:00			`{`
			`if (count($this->items) >= $this->getInput('limit')) {`
			`return null;`
			`}`
[bridges] Change to extend from FeedExpander 2016-09-05 19:43:56 +03:00
Reformat codebase v4 (#2872) Reformat code base to PSR12 Co-authored-by: rssbridge <noreply@github.com> 2022-07-01 16:10:30 +03:00			`// There is a bug in Developpez RSS, coma are writtent as '~?' in the`
			`// title, so I have to fix it manually`
			`$item['title'] = $this->fixComaInTitle($item['title']);`
Full rewrite of bridge DeveloppezDotCom (#2689) 2022-05-08 04:38:33 +03:00
Reformat codebase v4 (#2872) Reformat code base to PSR12 Co-authored-by: rssbridge <noreply@github.com> 2022-07-01 16:10:30 +03:00			`// We get the content of the full article behind the RSS item URL`
			`$articleHTMLContent = getSimpleHTMLDOMCached($item['uri']);`
Full rewrite of bridge DeveloppezDotCom (#2689) 2022-05-08 04:38:33 +03:00
Reformat codebase v4 (#2872) Reformat code base to PSR12 Co-authored-by: rssbridge <noreply@github.com> 2022-07-01 16:10:30 +03:00			`// Here we call our custom parser`
			`$fullText = $this->extractFullText($articleHTMLContent);`
			`if (!is_null($fullText)) {`
			`// if we manage to parse the page behind the url of the RSS item`
			`// then we set it as the new content. Otherwise we keep the default`
			`// content to avoid RSS Bridge to return an empty item`
			`$item['content'] = $fullText;`
			`}`
Full rewrite of bridge DeveloppezDotCom (#2689) 2022-05-08 04:38:33 +03:00
Reformat codebase v4 (#2872) Reformat code base to PSR12 Co-authored-by: rssbridge <noreply@github.com> 2022-07-01 16:10:30 +03:00			`// Now we will attach video url in item`
			`$videosUrl = $this->getAllVideoUrl($articleHTMLContent);`
			`if (!empty($videosUrl)) {`
			`$item['enclosures'] = array_merge($item['enclosures'], $videosUrl);`
			`}`
Full rewrite of bridge DeveloppezDotCom (#2689) 2022-05-08 04:38:33 +03:00
Reformat codebase v4 (#2872) Reformat code base to PSR12 Co-authored-by: rssbridge <noreply@github.com> 2022-07-01 16:10:30 +03:00			`// Now we can look for the blog writer/creator`
			`$author = $articleHTMLContent->find('[itemprop="creator"]', 0);`
			`if (!empty($author)) {`
			`$item['author'] = $author->outertext;`
			`}`
Full rewrite of bridge DeveloppezDotCom (#2689) 2022-05-08 04:38:33 +03:00
Reformat codebase v4 (#2872) Reformat code base to PSR12 Co-authored-by: rssbridge <noreply@github.com> 2022-07-01 16:10:30 +03:00			`return $item;`
			`}`
Full rewrite of bridge DeveloppezDotCom (#2689) 2022-05-08 04:38:33 +03:00
refactor: FeedExpander::parseItem() descendants (#3744) 2023-10-13 01:25:34 +03:00			`/**`
			`* Return the RSS url for selected domain`
			`*/`
			`private function getRssUrl()`
			`{`
			`$domain = $this->getInput('domain');`
			`if (!empty($domain)) {`
			`return 'https://' . $domain . self::DOMAIN . self::RSS_URL;`
			`}`

			`return self::URI . self::RSS_URL;`
			`}`

Reformat codebase v4 (#2872) Reformat code base to PSR12 Co-authored-by: rssbridge <noreply@github.com> 2022-07-01 16:10:30 +03:00			`/**`
			`* Replace '~?' by a proper coma ','`
			`*/`
			`private function fixComaInTitle($txt)`
			`{`
			`return str_replace('~?', ',', $txt);`
			`}`
[bridges] Change to extend from FeedExpander 2016-09-05 19:43:56 +03:00
Reformat codebase v4 (#2872) Reformat code base to PSR12 Co-authored-by: rssbridge <noreply@github.com> 2022-07-01 16:10:30 +03:00			`/**`
			`* Return the full article pointed by the url in the RSS item`
			`* Since Developpez.com only provides a short abstract of the article, we`
			`* use the url to retrieve the complete article and return it as the content`
			`*/`
			`private function extractFullText($articleHTMLContent)`
			`{`
			`// All blog entry contains a div with the class 'content'. This div`
			`// contains the complete blog article. But the RSS can also return`
			`// announcement and not a blog article. So the next if, should take`
			`// care of the "non blog" entry`
			`$divArticleEntry = $articleHTMLContent->find('div.content', 0);`
			`if (is_null($divArticleEntry)) {`
			`// Didn't find the div with class content. It is probably not a blog`
			`// entry. It is probably just an announcement for an ebook, a PDF,`
			`// etc. So we can use the default RSS item content.`
			`return null;`
			`}`
Full rewrite of bridge DeveloppezDotCom (#2689) 2022-05-08 04:38:33 +03:00
Reformat codebase v4 (#2872) Reformat code base to PSR12 Co-authored-by: rssbridge <noreply@github.com> 2022-07-01 16:10:30 +03:00			`// The following code is a bit hacky, but I really manage to get the`
			`// full content of articles without any encoding issues. What is very`
			`// weird and ugly in Developpez.com is the fact the some paragraphs of`
			`// the article will be encoded as UTF-8 and some other paragraphs will`
			`// be encoded as Windows-1252. So we can NOT decode the full article`
			`// with only one encoding. We have to check every paragraph and`
			`// determine its encoding`
Full rewrite of bridge DeveloppezDotCom (#2689) 2022-05-08 04:38:33 +03:00
Reformat codebase v4 (#2872) Reformat code base to PSR12 Co-authored-by: rssbridge <noreply@github.com> 2022-07-01 16:10:30 +03:00			`// This contains all the 'paragraphs' of the article. It includes the`
			`// pictures, the text and the links at the bottom of the article`
			`$paragraphs = $divArticleEntry->nodes;`
			`// This will store the complete decoded content`
			`$fullText = '';`
Full rewrite of bridge DeveloppezDotCom (#2689) 2022-05-08 04:38:33 +03:00
Reformat codebase v4 (#2872) Reformat code base to PSR12 Co-authored-by: rssbridge <noreply@github.com> 2022-07-01 16:10:30 +03:00			`// For each paragraph, we will identify the encoding, then decode it`
			`// and finally store the decoded content in $text`
			`foreach ($paragraphs as $paragraph) {`
			`// We have to recreate a new DOM document from the current node`
			`// otherwise the find function will look in the complet article and`
			`// not only in the current paragraph. This is an ugly behavior of`
			`// the library Simple HTML DOM Parser...`
			`$html = str_get_html($paragraph->outertext);`
			`$fullText .= $this->decodeParagraph($html);`
			`}`
Full rewrite of bridge DeveloppezDotCom (#2689) 2022-05-08 04:38:33 +03:00
Reformat codebase v4 (#2872) Reformat code base to PSR12 Co-authored-by: rssbridge <noreply@github.com> 2022-07-01 16:10:30 +03:00			`// Finally we return the full 'well' enconded content of the article`
			`return $fullText;`
			`}`
Full rewrite of bridge DeveloppezDotCom (#2689) 2022-05-08 04:38:33 +03:00
Reformat codebase v4 (#2872) Reformat code base to PSR12 Co-authored-by: rssbridge <noreply@github.com> 2022-07-01 16:10:30 +03:00			`/**`
			`*`
			`*/`
			`private function decodeParagraph($p)`
			`{`
			`// First we check if this paragraph is a video`
			`$videoUrl = $this->getVideoUrl($p);`
			`if (!empty($videoUrl)) {`
			`// If this is a video, we just return a link to the video`
			`// 📺 => 🎞️`
			`return '<p>`
Full rewrite of bridge DeveloppezDotCom (#2689) 2022-05-08 04:38:33 +03:00			`<b>📺 <a href="' . $videoUrl . '">Voir la vidéo</a></b>`
			`</p>';`
Reformat codebase v4 (#2872) Reformat code base to PSR12 Co-authored-by: rssbridge <noreply@github.com> 2022-07-01 16:10:30 +03:00			`}`
Developpez.com (FR) Bridge 2014-07-14 21:41:09 +04:00
Reformat codebase v4 (#2872) Reformat code base to PSR12 Co-authored-by: rssbridge <noreply@github.com> 2022-07-01 16:10:30 +03:00			`// We take outertext to get the complete paragraph not only the text`
			`// inside it. That way we still graph block <img> and so on.`
			`$pTxt = $p->outertext;`
			`// This will store the decoded text if we manage to decode it`
			`$decodedTxt = '';`
microsoft quotes and UTF8 encoding fixed for Developpez.com 2014-07-16 04:31:54 +04:00
Reformat codebase v4 (#2872) Reformat code base to PSR12 Co-authored-by: rssbridge <noreply@github.com> 2022-07-01 16:10:30 +03:00			`// This is the only way to properly decode each paragraph. I tried`
			`// many stuffs but this is the only working way I found.`
			`foreach (self::ENCONDINGS as $enc) {`
			`// We check the encoding of the current paragraph`
			`if (mb_check_encoding($pTxt, $enc)) {`
			`// If the encoding is well recognized, we can convert from`
			`// this encoding to UTF-8`
			`$decodedTxt = iconv($enc, 'UTF-8', $pTxt);`
			`}`
			`}`
Full rewrite of bridge DeveloppezDotCom (#2689) 2022-05-08 04:38:33 +03:00
Reformat codebase v4 (#2872) Reformat code base to PSR12 Co-authored-by: rssbridge <noreply@github.com> 2022-07-01 16:10:30 +03:00			`// We should not trim the strings to avoid the <a> to be glued to the`
			`// text like: the software<a href="...">started</a>to...`
			`if (!empty($decodedTxt)) {`
			`// We manage to decode the text, so we take the decoded version`
			`return $this->formatParagraph($decodedTxt);`
			`} else {`
			`// Otherwise we take the non decoded version and hope it will`
			`// be displayed not too ugly in the fulltext content`
			`return $this->formatParagraph($pTxt);`
			`}`
			`}`
microsoft quotes and UTF8 encoding fixed for Developpez.com 2014-07-16 04:31:54 +04:00
Reformat codebase v4 (#2872) Reformat code base to PSR12 Co-authored-by: rssbridge <noreply@github.com> 2022-07-01 16:10:30 +03:00			`/**`
			`* Return true in $txt is a HTML tag and not plain text`
			`*/`
			`private function isHtmlTagNotTxt($txt)`
			`{`
refactor: FeedExpander::parseItem() descendants (#3744) 2023-10-13 01:25:34 +03:00			`if ($txt === '') {`
			`return false;`
			`}`
Reformat codebase v4 (#2872) Reformat code base to PSR12 Co-authored-by: rssbridge <noreply@github.com> 2022-07-01 16:10:30 +03:00			`$html = str_get_html($txt);`
			`return $html && $html->root && count($html->root->children) > 0;`
			`}`
Full rewrite of bridge DeveloppezDotCom (#2689) 2022-05-08 04:38:33 +03:00
Reformat codebase v4 (#2872) Reformat code base to PSR12 Co-authored-by: rssbridge <noreply@github.com> 2022-07-01 16:10:30 +03:00			`/**`
			`* Will add a space before paragraph when needed`
			`*/`
			`private function formatParagraph($txt)`
			`{`
			`// If the paragraph is an html tag, we add a space before`
			`if ($this->isHtmlTagNotTxt($txt)) {`
			`// the first element is an html tag and not a text, so we can add a`
			`// space before it`
			`return ' ' . $txt;`
			`}`
			`// If the text start with word (not punctation), we had a space`
			`$pattern = '/^\w/';`
			`if (preg_match($pattern, $txt)) {`
			`return ' ' . $txt;`
			`}`
			`return $txt;`
			`}`
Full rewrite of bridge DeveloppezDotCom (#2689) 2022-05-08 04:38:33 +03:00
Reformat codebase v4 (#2872) Reformat code base to PSR12 Co-authored-by: rssbridge <noreply@github.com> 2022-07-01 16:10:30 +03:00			`/**`
			`* Retrieve all video url in the article`
			`*/`
			`private function getAllVideoUrl($item)`
			`{`
			`// Array of video url`
			`$url = [];`
Full rewrite of bridge DeveloppezDotCom (#2689) 2022-05-08 04:38:33 +03:00
Reformat codebase v4 (#2872) Reformat code base to PSR12 Co-authored-by: rssbridge <noreply@github.com> 2022-07-01 16:10:30 +03:00			`// Developpez use a div with the class video-container`
			`$divsVideo = $item->find('div.video-container');`
			`if (empty($divsVideo)) {`
			`return $url;`
			`}`
Full rewrite of bridge DeveloppezDotCom (#2689) 2022-05-08 04:38:33 +03:00
Reformat codebase v4 (#2872) Reformat code base to PSR12 Co-authored-by: rssbridge <noreply@github.com> 2022-07-01 16:10:30 +03:00			`// get the url of the video`
			`foreach ($divsVideo as $div) {`
			`$html = str_get_html($div->outertext);`
			`$url[] = $this->getVideoUrl($html);`
			`}`
Full rewrite of bridge DeveloppezDotCom (#2689) 2022-05-08 04:38:33 +03:00
Reformat codebase v4 (#2872) Reformat code base to PSR12 Co-authored-by: rssbridge <noreply@github.com> 2022-07-01 16:10:30 +03:00			`return $url;`
			`}`
Full rewrite of bridge DeveloppezDotCom (#2689) 2022-05-08 04:38:33 +03:00
Reformat codebase v4 (#2872) Reformat code base to PSR12 Co-authored-by: rssbridge <noreply@github.com> 2022-07-01 16:10:30 +03:00			`/**`
			`* Retrieve URL video. We have to check for the src of an iframe`
			`* Work for Youtube. Will have to test for other video platform`
			`*/`
			`private function getVideoUrl($p)`
			`{`
			`$divVideo = $p->find('div.video-container', 0);`
			`if (empty($divVideo)) {`
			`return null;`
			`}`
			`$iframe = $divVideo->find('iframe', 0);`
			`if (empty($iframe)) {`
			`return null;`
			`}`
			`$src = trim($iframe->getAttribute('src'));`
			`if (empty($src)) {`
			`return null;`
			`}`
			`if (str_starts_with($src, '//')) {`
			`$src = 'https:' . $src;`
			`}`
			`return $src;`
			`}`
Developpez.com (FR) Bridge 2014-07-14 21:41:09 +04:00			`}`