mirror of
https://github.com/RSS-Bridge/rss-bridge.git
synced 2024-12-18 17:10:29 +03:00
406 lines
15 KiB
PHP
406 lines
15 KiB
PHP
<?php
|
|
|
|
class DeveloppezDotComBridge extends FeedExpander
|
|
{
|
|
const MAINTAINER = 'Binnette';
|
|
const NAME = 'Developpez.com Actus (FR)';
|
|
const URI = 'https://www.developpez.com/';
|
|
const DOMAIN = '.developpez.com/';
|
|
const RSS_URL = 'index/rss';
|
|
const CACHE_TIMEOUT = 1800; // 30min
|
|
const DESCRIPTION = 'Returns complete posts from developpez.com';
|
|
// Encodings used by Developpez.com in the body of its articles
|
|
const ENCONDINGS = ['Windows-1252', 'UTF-8'];
|
|
const PARAMETERS = [
|
|
[
|
|
'limit' => [
|
|
'name' => 'Max items',
|
|
'type' => 'number',
|
|
'defaultValue' => 5,
|
|
],
|
|
// List of the available RSS feeds (one per sub-domain)
|
|
'domain' => [
|
|
'type' => 'list',
|
|
'name' => 'Domaine',
|
|
'title' => 'Chosissez un sous-domaine',
|
|
'values' => [
|
|
'= Domaine principal =' => 'www',
|
|
'4d' => '4d',
|
|
'abbyy' => 'abbyy',
|
|
'access' => 'access',
|
|
'agile' => 'agile',
|
|
'ajax' => 'ajax',
|
|
'algo' => 'algo',
|
|
'alm' => 'alm',
|
|
'android' => 'android',
|
|
'apache' => 'apache',
|
|
'applications' => 'applications',
|
|
'arduino' => 'arduino',
|
|
'asm' => 'asm',
|
|
'asp' => 'asp',
|
|
'aspose' => 'aspose',
|
|
'bacasable' => 'bacasable',
|
|
'big-data' => 'big-data',
|
|
'bpm' => 'bpm',
|
|
'bsd' => 'bsd',
|
|
'business-intelligence' => 'business-intelligence',
|
|
'c' => 'c',
|
|
'cloud-computing' => 'cloud-computing',
|
|
'club' => 'club',
|
|
'cms' => 'cms',
|
|
'cpp' => 'cpp',
|
|
'crm' => 'crm',
|
|
'css' => 'css',
|
|
'd' => 'd',
|
|
'dart' => 'dart',
|
|
'data-science' => 'data-science',
|
|
'db2' => 'db2',
|
|
'delphi' => 'delphi',
|
|
'dotnet' => 'dotnet',
|
|
'droit' => 'droit',
|
|
'eclipse' => 'eclipse',
|
|
'edi' => 'edi',
|
|
'embarque' => 'embarque',
|
|
'emploi' => 'emploi',
|
|
'etudes' => 'etudes',
|
|
'excel' => 'excel',
|
|
'firebird' => 'firebird',
|
|
'flash' => 'flash',
|
|
'go' => 'go',
|
|
'green-it' => 'green-it',
|
|
'gtk' => 'gtk',
|
|
'hardware' => 'hardware',
|
|
'hpc' => 'hpc',
|
|
'humour' => 'humour',
|
|
'ibmcloud' => 'ibmcloud',
|
|
'intelligence-artificielle' => 'intelligence-artificielle',
|
|
'interbase' => 'interbase',
|
|
'ios' => 'ios',
|
|
'java' => 'java',
|
|
'javascript' => 'javascript',
|
|
'javaweb' => 'javaweb',
|
|
'jetbrains' => 'jetbrains',
|
|
'jeux' => 'jeux',
|
|
'kotlin' => 'kotlin',
|
|
'labview' => 'labview',
|
|
'laravel' => 'laravel',
|
|
'latex' => 'latex',
|
|
'lazarus' => 'lazarus',
|
|
'linux' => 'linux',
|
|
'mac' => 'mac',
|
|
'matlab' => 'matlab',
|
|
'megaoffice' => 'megaoffice',
|
|
'merise' => 'merise',
|
|
'microsoft' => 'microsoft',
|
|
'mobiles' => 'mobiles',
|
|
'mongodb' => 'mongodb',
|
|
'mysql' => 'mysql',
|
|
'netbeans' => 'netbeans',
|
|
'nodejs' => 'nodejs',
|
|
'nosql' => 'nosql',
|
|
'objective-c' => 'objective-c',
|
|
'office' => 'office',
|
|
'open-source' => 'open-source',
|
|
'openoffice-libreoffice' => 'openoffice-libreoffice',
|
|
'oracle' => 'oracle',
|
|
'outlook' => 'outlook',
|
|
'pascal' => 'pascal',
|
|
'perl' => 'perl',
|
|
'php' => 'php',
|
|
'portail-emploi' => 'portail-emploi',
|
|
'portail-projets' => 'portail-projets',
|
|
'postgresql' => 'postgresql',
|
|
'powerpoint' => 'powerpoint',
|
|
'preprod-emploi' => 'preprod-emploi',
|
|
'programmation' => 'programmation',
|
|
'project' => 'project',
|
|
'purebasic' => 'purebasic',
|
|
'pyqt' => 'pyqt',
|
|
'python' => 'python',
|
|
'qt-creator' => 'qt-creator',
|
|
'qt' => 'qt',
|
|
'r' => 'r',
|
|
'raspberry-pi' => 'raspberry-pi',
|
|
'reseau' => 'reseau',
|
|
'ruby' => 'ruby',
|
|
'rust' => 'rust',
|
|
'sap' => 'sap',
|
|
'sas' => 'sas',
|
|
'scilab' => 'scilab',
|
|
'securite' => 'securite',
|
|
'sgbd' => 'sgbd',
|
|
'sharepoint' => 'sharepoint',
|
|
'solutions-entreprise' => 'solutions-entreprise',
|
|
'spring' => 'spring',
|
|
'sqlserver' => 'sqlserver',
|
|
'stages' => 'stages',
|
|
'supervision' => 'supervision',
|
|
'swift' => 'swift',
|
|
'sybase' => 'sybase',
|
|
'symfony' => 'symfony',
|
|
'systeme' => 'systeme',
|
|
'talend' => 'talend',
|
|
'typescript' => 'typescript',
|
|
'uml' => 'uml',
|
|
'unix' => 'unix',
|
|
'vb' => 'vb',
|
|
'vba' => 'vba',
|
|
'virtualisation' => 'virtualisation',
|
|
'visualstudio' => 'visualstudio',
|
|
'web-semantique' => 'web-semantique',
|
|
'web' => 'web',
|
|
'webmarketing' => 'webmarketing',
|
|
'wind' => 'wind',
|
|
'windows-azure' => 'windows-azure',
|
|
'windows' => 'windows',
|
|
'windowsphone' => 'windowsphone',
|
|
'word' => 'word',
|
|
'xhtml' => 'xhtml',
|
|
'xml' => 'xml',
|
|
'zend-framework' => 'zend-framework'
|
|
],
|
|
]
|
|
]
|
|
];
|
|
|
|
/**
 * Collects the items of the Developpez.com RSS feed.
 *
 * The feed URL is resolved by getRssUrl() and handed over to
 * collectExpandableDatas() together with the value 20 (presumably the
 * maximum number of feed entries to read - confirm against FeedExpander).
 */
public function collectData()
{
    $this->collectExpandableDatas($this->getRssUrl(), 20);
}
|
|
|
|
/**
 * Enriches one RSS item with data taken from the article page it links to.
 *
 * The title is repaired, the feed abstract is replaced by the full article
 * text when it can be extracted, embedded video URLs are added to the
 * enclosures and the author is read from the page.
 *
 * @param array $item Feed item; the keys 'title' and 'uri' are read.
 *
 * @return array|null The enriched item, or null as soon as the number of
 *                    collected items has reached the 'limit' input.
 */
protected function parseItem(array $item)
{
    if (count($this->items) >= $this->getInput('limit')) {
        return null;
    }

    // There is a bug in the Developpez RSS: commas are written as '~?' in
    // the title, so they have to be fixed manually
    $item['title'] = $this->fixComaInTitle($item['title']);

    // Fetch the full article behind the RSS item URL
    $articleHTMLContent = getSimpleHTMLDOMCached($item['uri']);
    if (!$articleHTMLContent) {
        // The page could not be turned into a DOM; keep the item as
        // delivered by the feed instead of calling methods on a non-object
        return $item;
    }

    // If the page behind the URL can be parsed, its text becomes the new
    // content. Otherwise the default content is kept so that the item is
    // never returned empty
    $fullText = $this->extractFullText($articleHTMLContent);
    if (!is_null($fullText)) {
        $item['content'] = $fullText;
    }

    // Attach the video URLs found in the article. The feed item does not
    // necessarily carry an 'enclosures' key, hence the empty-array default
    $videosUrl = $this->getAllVideoUrl($articleHTMLContent);
    if (!empty($videosUrl)) {
        $item['enclosures'] = array_merge($item['enclosures'] ?? [], $videosUrl);
    }

    // Look for the blog writer/creator
    $author = $articleHTMLContent->find('[itemprop="creator"]', 0);
    if (!empty($author)) {
        $item['author'] = $author->outertext;
    }

    return $item;
}
|
|
|
|
/**
 * Builds the URL of the RSS feed for the selected sub-domain.
 *
 * @return string 'https://<domain>.developpez.com/index/rss' when the
 *                'domain' input is set, the feed of the main site otherwise.
 */
private function getRssUrl()
{
    $subDomain = $this->getInput('domain');

    // No sub-domain selected: fall back to the feed of the main site
    if (empty($subDomain)) {
        return self::URI . self::RSS_URL;
    }

    return 'https://' . $subDomain . self::DOMAIN . self::RSS_URL;
}
|
|
|
|
/**
 * Replaces every '~?' sequence by a proper comma ','.
 *
 * @param string $txt Title as delivered by the feed.
 *
 * @return string The title with the commas restored.
 */
private function fixComaInTitle($txt)
{
    $brokenComma = '~?';
    $comma = ',';

    return str_replace($brokenComma, $comma, $txt);
}
|
|
|
|
/**
 * Returns the full article contained in an article page.
 *
 * Since Developpez.com only provides a short abstract in its feed, the
 * page behind the item URL is used to build the complete content.
 *
 * @param object $articleHTMLContent Parsed article page; must offer the
 *                                   find() method of Simple HTML DOM.
 *
 * @return string|null The decoded article, or null when the page holds no
 *                     'div.content' element.
 */
private function extractFullText($articleHTMLContent)
{
    // Every blog entry contains a div with the class 'content' holding the
    // complete article. The RSS can also return announcements (ebook, PDF,
    // etc.) that have no such div; for those the default RSS content is used
    $divArticleEntry = $articleHTMLContent->find('div.content', 0);
    if (is_null($divArticleEntry)) {
        return null;
    }

    // Developpez.com mixes encodings inside one article: some paragraphs
    // are UTF-8, others Windows-1252. The article can therefore not be
    // decoded in one go; every child node (text, pictures, links at the
    // bottom of the article) is decoded on its own
    $fullText = '';
    foreach ($divArticleEntry->nodes as $paragraph) {
        $rawParagraph = $paragraph->outertext;

        // A new DOM document is created from the current node, otherwise
        // find() would search the complete article and not only the
        // current paragraph
        $html = str_get_html($rawParagraph);
        if (!$html) {
            // str_get_html() gives false for empty or oversized input;
            // keep the raw markup instead of calling methods on false
            $fullText .= $rawParagraph;
            continue;
        }

        $fullText .= $this->decodeParagraph($html);
    }

    return $fullText;
}
|
|
|
|
/**
 * Converts one paragraph of an article into UTF-8 HTML.
 *
 * A paragraph holding a video container is replaced by a link to the
 * video. Any other paragraph is converted to UTF-8 from the encodings
 * listed in self::ENCONDINGS and then passed to formatParagraph().
 *
 * @param object $p DOM document holding exactly one paragraph.
 *
 * @return string HTML of the paragraph.
 */
private function decodeParagraph($p)
{
    // First check whether this paragraph is a video
    $videoUrl = $this->getVideoUrl($p);
    if (!empty($videoUrl)) {
        // The URL is read from the scraped page, so it is escaped before
        // being placed in an attribute. Double encoding is disabled to keep
        // entities that are already present in the URL (e.g. '&amp;')
        $safeUrl = htmlspecialchars($videoUrl, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8', false);

        return '<p><b>📺 <a href="' . $safeUrl . '">Voir la vidéo</a></b></p>';
    }

    // outertext is used to get the complete paragraph and not only the text
    // inside it. That way blocks such as <img> are kept
    $pTxt = $p->outertext;
    $decodedTxt = '';

    // Every encoding accepting the bytes is tried in turn and the last
    // successful conversion wins
    foreach (self::ENCONDINGS as $enc) {
        if (!mb_check_encoding($pTxt, $enc)) {
            continue;
        }

        $converted = iconv($enc, 'UTF-8', $pTxt);
        if ($converted !== false) {
            // A failed conversion must not discard an earlier success
            $decodedTxt = $converted;
        }
    }

    // The strings are not trimmed, to avoid an <a> being glued to the text
    // like: the software<a href="...">started</a>to...
    if (!empty($decodedTxt)) {
        return $this->formatParagraph($decodedTxt);
    }

    // Nothing could be decoded: use the raw version and hope it will not be
    // displayed too badly in the full text
    return $this->formatParagraph($pTxt);
}
|
|
|
|
/**
 * Tells whether $txt holds at least one HTML element rather than only
 * plain text.
 *
 * @param string $txt Fragment to inspect.
 *
 * @return bool True when the parsed fragment has a root with at least one
 *              child element, false otherwise (including the empty string).
 */
private function isHtmlTagNotTxt($txt)
{
    if ($txt !== '') {
        $dom = str_get_html($txt);

        return $dom && $dom->root && count($dom->root->children) > 0;
    }

    return false;
}
|
|
|
|
/**
 * Adds a space in front of a paragraph when one is needed to keep it from
 * being glued to the preceding text.
 *
 * @param string $txt HTML or plain text of one paragraph.
 *
 * @return string $txt prefixed with a space when it is an HTML element or
 *                starts with a letter, digit or underscore; $txt unchanged
 *                otherwise (e.g. when it starts with punctuation).
 */
private function formatParagraph($txt)
{
    // The paragraph is an HTML element and not a text: add a space before
    if ($this->isHtmlTagNotTxt($txt)) {
        return ' ' . $txt;
    }

    // The text starts with a word (not punctuation): add a space. The /u
    // modifier makes accented letters such as 'É' or 'à' count as letters;
    // without it they are seen as separate bytes and never match
    $startsWithWord = preg_match('/^[\p{L}\p{N}_]/u', $txt);
    if ($startsWithWord === false) {
        // preg_match() fails in /u mode when the text is not valid UTF-8;
        // fall back to the byte-wise test
        $startsWithWord = preg_match('/^\w/', $txt);
    }

    if ($startsWithWord) {
        return ' ' . $txt;
    }

    return $txt;
}
|
|
|
|
/**
 * Retrieves the URLs of all videos embedded in an article.
 *
 * @param object $item Parsed article page; must offer the find() method of
 *                     Simple HTML DOM.
 *
 * @return array List of video URLs; empty when the article has no usable
 *               video. Containers without a usable iframe URL are skipped.
 */
private function getAllVideoUrl($item)
{
    $urls = [];

    // Developpez uses a div with the class 'video-container'
    $divsVideo = $item->find('div.video-container');
    if (empty($divsVideo)) {
        return $urls;
    }

    foreach ($divsVideo as $div) {
        // getVideoUrl() searches for the container inside the document it
        // receives, so the div is re-parsed as a document of its own
        $html = str_get_html($div->outertext);
        if (!$html) {
            continue;
        }

        // Only keep real URLs: getVideoUrl() gives null when the container
        // has no iframe or the iframe has no src
        $videoUrl = $this->getVideoUrl($html);
        if (!empty($videoUrl)) {
            $urls[] = $videoUrl;
        }
    }

    return $urls;
}
|
|
|
|
/**
 * Retrieves the URL of the video embedded in a paragraph.
 *
 * The URL is the src of the first iframe found in the first
 * 'div.video-container' element. The original author notes that this
 * works for Youtube and is untested for other video platforms.
 *
 * @param object $p DOM document to search.
 *
 * @return string|null The video URL (a leading '//' is completed with
 *                     'https:'), or null when there is no container, no
 *                     iframe or no src.
 */
private function getVideoUrl($p)
{
    $container = $p->find('div.video-container', 0);
    $iframe = empty($container) ? null : $container->find('iframe', 0);
    if (empty($iframe)) {
        return null;
    }

    $source = trim($iframe->getAttribute('src'));
    if (empty($source)) {
        return null;
    }

    // Protocol-relative URL: make it absolute
    return str_starts_with($source, '//') ? 'https:' . $source : $source;
}
|
|
}
|