mirror of
https://github.com/RSS-Bridge/rss-bridge.git
synced 2025-03-14 20:21:14 +03:00
Full rewrite of bridge DeveloppezDotCom (#2689)
This commit is contained in:
parent
410daee1d5
commit
bc773a49f8
1 changed files with 390 additions and 30 deletions
|
@ -1,47 +1,407 @@
|
|||
<?php
|
||||
class DeveloppezDotComBridge extends FeedExpander {
|
||||
|
||||
const MAINTAINER = 'polopollo';
|
||||
class DeveloppezDotComBridge extends FeedExpander
|
||||
{
|
||||
|
||||
const MAINTAINER = 'Binnette';
|
||||
const NAME = 'Developpez.com Actus (FR)';
|
||||
const URI = 'https://www.developpez.com/';
|
||||
const DOMAIN = '.developpez.com/';
|
||||
const RSS_URL = 'index/rss';
|
||||
const CACHE_TIMEOUT = 1800; // 30min
|
||||
const DESCRIPTION = 'Returns the 15 newest posts from DeveloppezDotCom (full text).';
|
||||
const DESCRIPTION = 'Returns complete posts from developpez.com';
|
||||
// Encodings used by Developpez.com in their articles body
|
||||
const ENCONDINGS = array('Windows-1252', 'UTF-8');
|
||||
const PARAMETERS = array(
|
||||
array(
|
||||
'limit' => array(
|
||||
'name' => 'Max items',
|
||||
'type' => 'number',
|
||||
'defaultValue' => 5,
|
||||
),
|
||||
// List of the available RSS feeds (one per sub-domain)
|
||||
'domain' => array(
|
||||
'type' => 'list',
|
||||
'name' => 'Domaine',
|
||||
'title' => 'Chosissez un sous-domaine',
|
||||
'values' => array(
|
||||
'= Domaine principal =' => 'www',
|
||||
'4d' => '4d',
|
||||
'abbyy' => 'abbyy',
|
||||
'access' => 'access',
|
||||
'agile' => 'agile',
|
||||
'ajax' => 'ajax',
|
||||
'algo' => 'algo',
|
||||
'alm' => 'alm',
|
||||
'android' => 'android',
|
||||
'apache' => 'apache',
|
||||
'applications' => 'applications',
|
||||
'arduino' => 'arduino',
|
||||
'asm' => 'asm',
|
||||
'asp' => 'asp',
|
||||
'aspose' => 'aspose',
|
||||
'bacasable' => 'bacasable',
|
||||
'big-data' => 'big-data',
|
||||
'bpm' => 'bpm',
|
||||
'bsd' => 'bsd',
|
||||
'business-intelligence' => 'business-intelligence',
|
||||
'c' => 'c',
|
||||
'cloud-computing' => 'cloud-computing',
|
||||
'club' => 'club',
|
||||
'cms' => 'cms',
|
||||
'cpp' => 'cpp',
|
||||
'crm' => 'crm',
|
||||
'css' => 'css',
|
||||
'd' => 'd',
|
||||
'dart' => 'dart',
|
||||
'data-science' => 'data-science',
|
||||
'db2' => 'db2',
|
||||
'delphi' => 'delphi',
|
||||
'dotnet' => 'dotnet',
|
||||
'droit' => 'droit',
|
||||
'eclipse' => 'eclipse',
|
||||
'edi' => 'edi',
|
||||
'embarque' => 'embarque',
|
||||
'emploi' => 'emploi',
|
||||
'etudes' => 'etudes',
|
||||
'excel' => 'excel',
|
||||
'firebird' => 'firebird',
|
||||
'flash' => 'flash',
|
||||
'go' => 'go',
|
||||
'green-it' => 'green-it',
|
||||
'gtk' => 'gtk',
|
||||
'hardware' => 'hardware',
|
||||
'hpc' => 'hpc',
|
||||
'humour' => 'humour',
|
||||
'ibmcloud' => 'ibmcloud',
|
||||
'intelligence-artificielle' => 'intelligence-artificielle',
|
||||
'interbase' => 'interbase',
|
||||
'ios' => 'ios',
|
||||
'java' => 'java',
|
||||
'javascript' => 'javascript',
|
||||
'javaweb' => 'javaweb',
|
||||
'jetbrains' => 'jetbrains',
|
||||
'jeux' => 'jeux',
|
||||
'kotlin' => 'kotlin',
|
||||
'labview' => 'labview',
|
||||
'laravel' => 'laravel',
|
||||
'latex' => 'latex',
|
||||
'lazarus' => 'lazarus',
|
||||
'linux' => 'linux',
|
||||
'mac' => 'mac',
|
||||
'matlab' => 'matlab',
|
||||
'megaoffice' => 'megaoffice',
|
||||
'merise' => 'merise',
|
||||
'microsoft' => 'microsoft',
|
||||
'mobiles' => 'mobiles',
|
||||
'mongodb' => 'mongodb',
|
||||
'mysql' => 'mysql',
|
||||
'netbeans' => 'netbeans',
|
||||
'nodejs' => 'nodejs',
|
||||
'nosql' => 'nosql',
|
||||
'objective-c' => 'objective-c',
|
||||
'office' => 'office',
|
||||
'open-source' => 'open-source',
|
||||
'openoffice-libreoffice' => 'openoffice-libreoffice',
|
||||
'oracle' => 'oracle',
|
||||
'outlook' => 'outlook',
|
||||
'pascal' => 'pascal',
|
||||
'perl' => 'perl',
|
||||
'php' => 'php',
|
||||
'portail-emploi' => 'portail-emploi',
|
||||
'portail-projets' => 'portail-projets',
|
||||
'postgresql' => 'postgresql',
|
||||
'powerpoint' => 'powerpoint',
|
||||
'preprod-emploi' => 'preprod-emploi',
|
||||
'programmation' => 'programmation',
|
||||
'project' => 'project',
|
||||
'purebasic' => 'purebasic',
|
||||
'pyqt' => 'pyqt',
|
||||
'python' => 'python',
|
||||
'qt-creator' => 'qt-creator',
|
||||
'qt' => 'qt',
|
||||
'r' => 'r',
|
||||
'raspberry-pi' => 'raspberry-pi',
|
||||
'reseau' => 'reseau',
|
||||
'ruby' => 'ruby',
|
||||
'rust' => 'rust',
|
||||
'sap' => 'sap',
|
||||
'sas' => 'sas',
|
||||
'scilab' => 'scilab',
|
||||
'securite' => 'securite',
|
||||
'sgbd' => 'sgbd',
|
||||
'sharepoint' => 'sharepoint',
|
||||
'solutions-entreprise' => 'solutions-entreprise',
|
||||
'spring' => 'spring',
|
||||
'sqlserver' => 'sqlserver',
|
||||
'stages' => 'stages',
|
||||
'supervision' => 'supervision',
|
||||
'swift' => 'swift',
|
||||
'sybase' => 'sybase',
|
||||
'symfony' => 'symfony',
|
||||
'systeme' => 'systeme',
|
||||
'talend' => 'talend',
|
||||
'typescript' => 'typescript',
|
||||
'uml' => 'uml',
|
||||
'unix' => 'unix',
|
||||
'vb' => 'vb',
|
||||
'vba' => 'vba',
|
||||
'virtualisation' => 'virtualisation',
|
||||
'visualstudio' => 'visualstudio',
|
||||
'web-semantique' => 'web-semantique',
|
||||
'web' => 'web',
|
||||
'webmarketing' => 'webmarketing',
|
||||
'wind' => 'wind',
|
||||
'windows-azure' => 'windows-azure',
|
||||
'windows' => 'windows',
|
||||
'windowsphone' => 'windowsphone',
|
||||
'word' => 'word',
|
||||
'xhtml' => 'xhtml',
|
||||
'xml' => 'xml',
|
||||
'zend-framework' => 'zend-framework'
|
||||
),
|
||||
)
|
||||
)
|
||||
);
|
||||
|
||||
/**
 * Build the RSS feed URL for the sub-domain selected by the user.
 *
 * @return string The feed URL on the chosen sub-domain, or on the main site
 *                (self::URI) when the 'domain' input is empty.
 */
private function getRssUrl()
{
    $domain = $this->getInput('domain');
    if (empty($domain)) {
        return self::URI . self::RSS_URL;
    }

    // self::DOMAIN already carries the leading dot and the trailing slash
    return 'https://' . $domain . self::DOMAIN . self::RSS_URL;
}
|
||||
|
||||
/**
 * Collect the feed entries from the Developpez.com RSS feed.
 *
 * The feed URL depends on the selected sub-domain (see getRssUrl()); at most
 * 20 entries are requested from the feed expander.
 */
public function collectData()
{
    $this->collectExpandableDatas($this->getRssUrl(), 20);
}
|
||||
|
||||
/**
 * Parse one RSS entry and, when possible, replace its abstract with the
 * full article found behind the entry URL.
 *
 * The article page is also searched for embedded videos (added to the
 * item enclosures) and for the author markup.
 *
 * @param mixed $newsItem Raw feed entry, handed over to the parent parser
 * @return array|null The feed item, or null once the 'limit' input is reached
 */
protected function parseItem($newsItem)
{
    // A missing or non-positive limit means "no extra cap" instead of
    // silently discarding every entry (count >= null is always true).
    $limit = (int) $this->getInput('limit');
    if ($limit > 0 && count($this->items) >= $limit) {
        return null;
    }

    // Default parsing of the RSS entry
    $item = parent::parseItem($newsItem);

    // Developpez.com writes commas as '~?' in the RSS titles
    $item['title'] = $this->fixComaInTitle($item['title']);

    // Fetch the full article behind the RSS item URL (one single request)
    $articleHTMLContent = getSimpleHTMLDOMCached($item['uri']);

    // Only override the default RSS content when the article body could be
    // extracted, so the item is never left with an empty content
    $fullText = $this->extractFullText($articleHTMLContent);
    if (!is_null($fullText)) {
        $item['content'] = $fullText;
    }

    // Attach the video URLs; the parent parser may not have set 'enclosures'
    $videosUrl = $this->getAllVideoUrl($articleHTMLContent);
    if (!empty($videosUrl)) {
        $item['enclosures'] = array_merge($item['enclosures'] ?? array(), $videosUrl);
    }

    // Blog writer/creator, when the page exposes one
    $author = $articleHTMLContent->find('[itemprop="creator"]', 0);
    if (!empty($author)) {
        $item['author'] = $author->outertext;
    }

    return $item;
}
|
||||
|
||||
/**
 * Replace the single-byte "smart" quotes and dash produced by Microsoft
 * Word (bytes 145-148 and 151) with plain ASCII characters.
 *
 * Legacy helper: within this class it is only called by extractContent().
 * Trick taken from:
 * http://stackoverflow.com/questions/1262038/how-to-replace-microsoft-encoded-quotes-in-php
 *
 * @param string $string Text that may contain the single-byte smart punctuation
 * @return string The text with ', " and - substituted
 */
private function convertSmartQuotes($string)
{
    $search = array(
        chr(145),
        chr(146),
        chr(147),
        chr(148),
        chr(151),
    );

    $replace = array(
        "'",
        "'",
        '"',
        '"',
        '-',
    );

    return str_replace($search, $replace, $string);
}

/**
 * Replace every '~?' sequence by a proper comma ','.
 *
 * @param string $txt Item title as delivered by the RSS feed
 * @return string The title with its commas restored
 */
private function fixComaInTitle($txt)
{
    return str_replace('~?', ',', $txt);
}
|
||||
|
||||
/**
 * Legacy extractor: fetch the page at $url and return the inner HTML of its
 * first div.content, converted from ISO-8859-1 to UTF-8.
 *
 * Kept for the callers that still pass a URL; extractFullText() works on an
 * already parsed page and decodes each paragraph separately.
 *
 * @param string $url Article URL
 * @return string Trimmed article HTML, or '' when the page has no div.content
 */
private function extractContent($url)
{
    $articleHTMLContent = getSimpleHTMLDOMCached($url);
    $divContent = $articleHTMLContent->find('div.content', 0);
    if (is_null($divContent)) {
        // No article body on this page: nothing to extract
        return '';
    }

    $text = $this->convertSmartQuotes($divContent->innertext);
    // Same conversion as utf8_encode(), which is deprecated since PHP 8.2
    $text = mb_convert_encoding($text, 'UTF-8', 'ISO-8859-1');
    return trim($text);
}

/**
 * Return the full article contained in an already parsed article page.
 *
 * Developpez.com only provides a short abstract in its RSS feed, so the
 * complete article is taken from the div with class 'content' of the page.
 *
 * @param object $articleHTMLContent Parsed article page (exposes find())
 * @return string|null The decoded article HTML, or null when the page has no
 *                     div.content (announcement for an ebook, a PDF, etc.),
 *                     in which case the default RSS content should be kept
 */
private function extractFullText($articleHTMLContent)
{
    $divArticleEntry = $articleHTMLContent->find('div.content', 0);
    if (is_null($divArticleEntry)) {
        return null;
    }

    // According to the original author, some paragraphs of an article are
    // encoded as UTF-8 and others as Windows-1252, so the article can NOT be
    // decoded with a single encoding: every child node (text, pictures,
    // links at the bottom of the article) is decoded on its own.
    $fullText = '';
    foreach ($divArticleEntry->nodes as $paragraph) {
        $fragment = $paragraph->outertext;

        // A new DOM document is built from the node, otherwise find() would
        // search the complete article and not only the current paragraph.
        $html = str_get_html($fragment);
        if ($html === false) {
            // str_get_html() rejects empty or oversized strings; keep the
            // raw fragment instead of calling methods on false
            $fullText .= $fragment;
            continue;
        }

        $fullText .= $this->decodeParagraph($html);
    }

    return $fullText;
}
|
||||
|
||||
/**
 * Convert one article fragment to UTF-8 HTML.
 *
 * A fragment holding a video container is replaced by a plain link to the
 * video. Any other fragment is checked against every encoding listed in
 * self::ENCONDINGS and converted to UTF-8 with iconv(); when several
 * encodings validate, the last one of the list wins. When no conversion
 * produced a usable string, the raw fragment is used instead.
 *
 * @param object $p Parsed fragment (exposes find() and outertext)
 * @return string The video link, or the fragment passed through formatParagraph()
 */
private function decodeParagraph($p)
{
    $videoUrl = $this->getVideoUrl($p);
    if (!empty($videoUrl)) {
        // Video fragment: only a link to the video is returned
        // 📺 => 🎞️
        return '<p>' . "\n"
            . '<b>📺 <a href="' . $videoUrl . '">Voir la vidéo</a></b>' . "\n"
            . '</p>';
    }

    // outertext keeps the markup (<img> and so on), not only the text
    $rawHtml = $p->outertext;

    $converted = '';
    foreach (self::ENCONDINGS as $candidate) {
        if (!mb_check_encoding($rawHtml, $candidate)) {
            continue;
        }
        $converted = iconv($candidate, 'UTF-8', $rawHtml);
    }

    // The string is deliberately not trimmed, so that an <a> is not glued to
    // the surrounding text: the software<a href="...">started</a>to...
    return $this->formatParagraph(empty($converted) ? $rawHtml : $converted);
}
|
||||
|
||||
/**
 * Tell whether $txt parses to at least one HTML element instead of being
 * plain text only.
 *
 * @param string $txt Fragment to inspect
 * @return bool True when the parsed root has at least one child element
 */
private function isHtmlTagNotTxt($txt)
{
    $dom = str_get_html($txt);
    if (!$dom || !$dom->root) {
        return false;
    }

    return count($dom->root->children) > 0;
}
|
||||
|
||||
/**
 * Prepend a space to a fragment when it is needed to keep it separated from
 * the previous one.
 *
 * A space is added when the fragment is an HTML element, or when it starts
 * with a word character (letter, digit or underscore) rather than with
 * punctuation.
 *
 * @param string $txt Decoded fragment
 * @return string The fragment, possibly prefixed with one space
 */
private function formatParagraph($txt)
{
    if ($this->isHtmlTagNotTxt($txt)) {
        return ' ' . $txt;
    }

    // The Unicode-aware pattern recognises accented first letters in UTF-8
    // text; the byte-wise pattern is kept as a fallback because a /u match
    // fails on fragments that could not be decoded to valid UTF-8.
    $startsWithWord = preg_match('/^[\p{L}\p{N}_]/u', $txt) === 1
        || preg_match('/^\w/', $txt) === 1;

    return $startsWithWord ? ' ' . $txt : $txt;
}
|
||||
|
||||
/**
 * Retrieve the URL of every video embedded in the article.
 *
 * Developpez.com wraps each video in a div with the class 'video-container'.
 * Containers without a usable iframe source are skipped, so the result never
 * contains null entries.
 *
 * @param object $item Parsed article page (exposes find())
 * @return array List of video URLs, empty when the article has no video
 */
private function getAllVideoUrl($item)
{
    $url = array();

    $divsVideo = $item->find('div.video-container');
    if (empty($divsVideo)) {
        return $url;
    }

    foreach ($divsVideo as $div) {
        // getVideoUrl() looks for the container inside the document it is
        // given, so each div is re-parsed as its own document
        $html = str_get_html($div->outertext);
        if ($html === false) {
            continue;
        }

        $videoUrl = $this->getVideoUrl($html);
        if (!empty($videoUrl)) {
            $url[] = $videoUrl;
        }
    }

    return $url;
}
|
||||
|
||||
/**
 * Retrieve the URL of the video held by a fragment, read from the src
 * attribute of the iframe found in its 'video-container' div.
 *
 * Works for Youtube; other video platforms still have to be tested.
 *
 * @param object $p Parsed fragment (exposes find())
 * @return string|null The video URL (protocol-relative sources are prefixed
 *                     with 'https:'), or null when no video source is found
 */
private function getVideoUrl($p)
{
    $container = $p->find('div.video-container', 0);
    $frame = empty($container) ? null : $container->find('iframe', 0);
    if (empty($frame)) {
        return null;
    }

    $source = trim($frame->getAttribute('src'));
    if (empty($source)) {
        return null;
    }

    return str_starts_with($source, '//') ? 'https:' . $source : $source;
}
|
||||
}
|
||||
|
|
Loading…
Add table
Reference in a new issue