<?php

// Source path: rss-bridge/bridges/DeveloppezDotComBridge.php
class DeveloppezDotComBridge extends FeedExpander
{
const MAINTAINER = 'Binnette';
const NAME = 'Developpez.com Actus (FR)';
const URI = 'https://www.developpez.com/';
// Suffix appended to the selected sub-domain to build the feed host.
const DOMAIN = '.developpez.com/';
// Path of the RSS feed, relative to the (sub-)domain root.
const RSS_URL = 'index/rss';
const CACHE_TIMEOUT = 1800; // 30 min
const DESCRIPTION = 'Returns complete posts from developpez.com';
// Encodings tried, in this order, on every paragraph of an article body.
// The constant name keeps its historical spelling because other methods
// of this class reference it.
const ENCONDINGS = ['Windows-1252', 'UTF-8'];
const PARAMETERS = [
    [
        // Maximum number of items returned by the bridge.
        'limit' => [
            'name' => 'Max items',
            'type' => 'number',
            'defaultValue' => 5,
        ],
        // List of the available RSS feeds, one per sub-domain.
        'domain' => [
            'type' => 'list',
            'name' => 'Domaine',
            'title' => 'Choisissez un sous-domaine',
            'values' => [
                '= Domaine principal =' => 'www',
                '4d' => '4d',
                'abbyy' => 'abbyy',
                'access' => 'access',
                'agile' => 'agile',
                'ajax' => 'ajax',
                'algo' => 'algo',
                'alm' => 'alm',
                'android' => 'android',
                'apache' => 'apache',
                'applications' => 'applications',
                'arduino' => 'arduino',
                'asm' => 'asm',
                'asp' => 'asp',
                'aspose' => 'aspose',
                'bacasable' => 'bacasable',
                'big-data' => 'big-data',
                'bpm' => 'bpm',
                'bsd' => 'bsd',
                'business-intelligence' => 'business-intelligence',
                'c' => 'c',
                'cloud-computing' => 'cloud-computing',
                'club' => 'club',
                'cms' => 'cms',
                'cpp' => 'cpp',
                'crm' => 'crm',
                'css' => 'css',
                'd' => 'd',
                'dart' => 'dart',
                'data-science' => 'data-science',
                'db2' => 'db2',
                'delphi' => 'delphi',
                'dotnet' => 'dotnet',
                'droit' => 'droit',
                'eclipse' => 'eclipse',
                'edi' => 'edi',
                'embarque' => 'embarque',
                'emploi' => 'emploi',
                'etudes' => 'etudes',
                'excel' => 'excel',
                'firebird' => 'firebird',
                'flash' => 'flash',
                'go' => 'go',
                'green-it' => 'green-it',
                'gtk' => 'gtk',
                'hardware' => 'hardware',
                'hpc' => 'hpc',
                'humour' => 'humour',
                'ibmcloud' => 'ibmcloud',
                'intelligence-artificielle' => 'intelligence-artificielle',
                'interbase' => 'interbase',
                'ios' => 'ios',
                'java' => 'java',
                'javascript' => 'javascript',
                'javaweb' => 'javaweb',
                'jetbrains' => 'jetbrains',
                'jeux' => 'jeux',
                'kotlin' => 'kotlin',
                'labview' => 'labview',
                'laravel' => 'laravel',
                'latex' => 'latex',
                'lazarus' => 'lazarus',
                'linux' => 'linux',
                'mac' => 'mac',
                'matlab' => 'matlab',
                'megaoffice' => 'megaoffice',
                'merise' => 'merise',
                'microsoft' => 'microsoft',
                'mobiles' => 'mobiles',
                'mongodb' => 'mongodb',
                'mysql' => 'mysql',
                'netbeans' => 'netbeans',
                'nodejs' => 'nodejs',
                'nosql' => 'nosql',
                'objective-c' => 'objective-c',
                'office' => 'office',
                'open-source' => 'open-source',
                'openoffice-libreoffice' => 'openoffice-libreoffice',
                'oracle' => 'oracle',
                'outlook' => 'outlook',
                'pascal' => 'pascal',
                'perl' => 'perl',
                'php' => 'php',
                'portail-emploi' => 'portail-emploi',
                'portail-projets' => 'portail-projets',
                'postgresql' => 'postgresql',
                'powerpoint' => 'powerpoint',
                'preprod-emploi' => 'preprod-emploi',
                'programmation' => 'programmation',
                'project' => 'project',
                'purebasic' => 'purebasic',
                'pyqt' => 'pyqt',
                'python' => 'python',
                'qt-creator' => 'qt-creator',
                'qt' => 'qt',
                'r' => 'r',
                'raspberry-pi' => 'raspberry-pi',
                'reseau' => 'reseau',
                'ruby' => 'ruby',
                'rust' => 'rust',
                'sap' => 'sap',
                'sas' => 'sas',
                'scilab' => 'scilab',
                'securite' => 'securite',
                'sgbd' => 'sgbd',
                'sharepoint' => 'sharepoint',
                'solutions-entreprise' => 'solutions-entreprise',
                'spring' => 'spring',
                'sqlserver' => 'sqlserver',
                'stages' => 'stages',
                'supervision' => 'supervision',
                'swift' => 'swift',
                'sybase' => 'sybase',
                'symfony' => 'symfony',
                'systeme' => 'systeme',
                'talend' => 'talend',
                'typescript' => 'typescript',
                'uml' => 'uml',
                'unix' => 'unix',
                'vb' => 'vb',
                'vba' => 'vba',
                'virtualisation' => 'virtualisation',
                'visualstudio' => 'visualstudio',
                'web-semantique' => 'web-semantique',
                'web' => 'web',
                'webmarketing' => 'webmarketing',
                'wind' => 'wind',
                'windows-azure' => 'windows-azure',
                'windows' => 'windows',
                'windowsphone' => 'windowsphone',
                'word' => 'word',
                'xhtml' => 'xhtml',
                'xml' => 'xml',
                'zend-framework' => 'zend-framework',
            ],
        ],
    ],
];
/**
 * Entry point of the bridge: loads the RSS feed of the selected
 * Developpez.com sub-domain and hands it to the feed expander, passing 20
 * as its second argument.
 */
public function collectData()
{
    $this->collectExpandableDatas($this->getRssUrl(), 20);
}
/**
 * Enrich one RSS item with the full article it links to.
 *
 * The title is repaired, the abstract is replaced by the complete article
 * body when it can be extracted, video URLs found in the page are added as
 * enclosures and the author is read from the page.
 *
 * @param array $item Item parsed from the feed; the keys 'title' and 'uri'
 *                    are read, 'content', 'enclosures' and 'author' may be
 *                    overwritten.
 * @return array|null The enriched item, or null once the number of
 *                    collected items has reached the 'limit' input.
 */
protected function parseItem(array $item)
{
    // Stop producing items once the requested maximum has been collected.
    if (count($this->items) >= $this->getInput('limit')) {
        return null;
    }

    // Developpez's RSS feed writes commas as '~?' in titles; repair them.
    $item['title'] = $this->fixComaInTitle($item['title']);

    // Fetch the full article behind the RSS item URL.
    $articleHTMLContent = getSimpleHTMLDOMCached($item['uri']);
    if (!$articleHTMLContent) {
        // NOTE(review): presumably the helper yields a falsy value when the
        // page cannot be parsed - confirm. In that case keep the item with
        // its default abstract instead of crashing on a method call below.
        return $item;
    }

    // Replace the abstract only when the article body could be extracted,
    // so that the bridge never returns an empty item.
    $fullText = $this->extractFullText($articleHTMLContent);
    if (!is_null($fullText)) {
        $item['content'] = $fullText;
    }

    // Attach the video URLs found in the article. The existing list falls
    // back to an empty array so a missing 'enclosures' key cannot make
    // array_merge() fail.
    $videosUrl = $this->getAllVideoUrl($articleHTMLContent);
    if (!empty($videosUrl)) {
        $item['enclosures'] = array_merge($item['enclosures'] ?? [], $videosUrl);
    }

    // Look for the writer of the article.
    $author = $articleHTMLContent->find('[itemprop="creator"]', 0);
    if (!empty($author)) {
        $item['author'] = $author->outertext;
    }

    return $item;
}
/**
 * Build the URL of the RSS feed for the sub-domain chosen by the user.
 *
 * @return string Feed URL on the selected sub-domain, or on the main site
 *                when the 'domain' input is empty.
 */
private function getRssUrl()
{
    $subDomain = $this->getInput('domain');

    // No sub-domain selected: use the feed of the main site.
    if (empty($subDomain)) {
        return self::URI . self::RSS_URL;
    }

    return 'https://' . $subDomain . self::DOMAIN . self::RSS_URL;
}
/**
 * Replace every '~?' sequence by a proper comma ','.
 *
 * @param string $txt Title as found in the RSS feed
 * @return string Title with the sequences replaced
 */
private function fixComaInTitle($txt)
{
    $brokenComma = '~?';
    $comma = ',';

    return str_replace($brokenComma, $comma, $txt);
}
/**
 * Return the full article contained in the page behind an RSS item.
 *
 * Developpez.com only provides a short abstract in its feed, so the linked
 * page is used to rebuild the complete article.
 *
 * @param object $articleHTMLContent Parsed page; must offer find()
 * @return string|null Decoded article body, or null when the page holds no
 *                     'div.content' element or nothing usable inside it, so
 *                     that the caller can keep the default RSS content.
 */
private function extractFullText($articleHTMLContent)
{
    // Blog entries keep the whole article in a div with the class
    // 'content'. The feed also carries announcements (ebook, PDF, ...)
    // that have no such div; for those the default RSS content is kept.
    $divArticleEntry = $articleHTMLContent->find('div.content', 0);
    if (is_null($divArticleEntry)) {
        return null;
    }

    // Developpez.com mixes encodings inside one article: some paragraphs
    // are UTF-8, others Windows-1252. The article therefore cannot be
    // decoded in one pass; every child node (text, pictures, links at the
    // bottom of the article) is decoded on its own.
    $fullText = '';
    foreach ($divArticleEntry->nodes as $paragraph) {
        $rawParagraph = $paragraph->outertext;

        // A new DOM document is built from the current node, otherwise
        // find() would search the complete article and not only this
        // paragraph.
        $html = str_get_html($rawParagraph);
        if (!$html) {
            // The node could not be parsed on its own: keep its raw markup
            // rather than calling methods on a non-object.
            $fullText .= $rawParagraph;
            continue;
        }

        $fullText .= $this->decodeParagraph($html);
    }

    // An empty result would wipe the RSS abstract in the caller; report
    // "nothing extracted" instead.
    if (trim($fullText) === '') {
        return null;
    }

    return $fullText;
}
/**
 * Convert one paragraph of an article to UTF-8 HTML.
 *
 * A paragraph holding a video container is replaced by a link to the
 * video. Any other paragraph is checked against each encoding listed in
 * self::ENCONDINGS and converted to UTF-8.
 *
 * @param object $p DOM document built from a single paragraph; must offer
 *                  find() and the outertext property
 * @return string HTML of the paragraph, possibly prefixed with a space by
 *                formatParagraph()
 */
private function decodeParagraph($p)
{
    // First check whether this paragraph is a video.
    $videoUrl = $this->getVideoUrl($p);
    if (!empty($videoUrl)) {
        // A video is rendered as a plain link. The URL comes from the
        // remote page, so it is escaped before being put in the attribute;
        // entities that are already encoded are left untouched.
        $safeUrl = htmlspecialchars($videoUrl, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8', false);

        // &#128250; is the television emoji (U+1F4FA)
        return '<p>' . "\n"
            . '<b>&#128250; <a href="' . $safeUrl . '">Voir la vidéo</a></b>' . "\n"
            . '</p>';
    }

    // outertext is used to keep the complete paragraph and not only its
    // text, so that <img> blocks and the like are preserved.
    $pTxt = $p->outertext;

    // Every encoding of the list is tried and there is deliberately no
    // break: the last encoding that validates the bytes wins.
    $decodedTxt = '';
    foreach (self::ENCONDINGS as $enc) {
        if (!mb_check_encoding($pTxt, $enc)) {
            continue;
        }

        $converted = iconv($enc, 'UTF-8', $pTxt);
        if ($converted !== false) {
            // A failed conversion must not discard an earlier success.
            $decodedTxt = $converted;
        }
    }

    // The strings are not trimmed, to avoid gluing an <a> to the text
    // around it like: the software<a href="...">started</a>to...
    // When nothing could be decoded the raw paragraph is used as is.
    return $this->formatParagraph($decodedTxt !== '' ? $decodedTxt : $pTxt);
}
/**
 * Tell whether $txt holds HTML markup rather than plain text.
 *
 * @param string $txt Fragment to inspect
 * @return bool True when parsing $txt yields a root with at least one
 *              child element
 */
private function isHtmlTagNotTxt($txt)
{
    // An empty string cannot contain a tag.
    if ($txt === '') {
        return false;
    }

    $dom = str_get_html($txt);
    if (!$dom || !$dom->root) {
        // Parsing failed or produced no root node.
        return false;
    }

    return count($dom->root->children) > 0;
}
/**
 * Add a space before a paragraph when needed, so that it is not glued to
 * the previous one.
 *
 * A space is added when the paragraph starts with a word character
 * (letter, digit or underscore, accented letters included) or when it
 * contains HTML markup. Paragraphs starting with punctuation or
 * whitespace are returned unchanged.
 *
 * @param string $txt Paragraph, expected to be UTF-8
 * @return string The paragraph, possibly prefixed with one space
 */
private function formatParagraph($txt)
{
    // Unicode-aware test: without the 'u' modifier \w only knows ASCII, so
    // French words such as "École" or "À" would never match.
    $startsWithWord = preg_match('/^[\p{L}\p{N}_]/u', $txt);
    if ($startsWithWord === false) {
        // preg_match() reports an error on text that is not valid UTF-8;
        // fall back to the byte-wise test in that case.
        $startsWithWord = preg_match('/^\w/', $txt);
    }

    // The cheap regex test runs first; both conditions lead to the same
    // result, so their order does not change the output.
    if ($startsWithWord || $this->isHtmlTagNotTxt($txt)) {
        return ' ' . $txt;
    }

    return $txt;
}
/**
 * Retrieve the URL of every video of the article.
 *
 * @param object $item Parsed article page; must offer find()
 * @return array List of video URLs; empty when the article has no video.
 *               Containers without a usable URL are skipped, so the list
 *               never holds null entries.
 */
private function getAllVideoUrl($item)
{
    $urls = [];

    // Developpez wraps each video in a div with the class video-container.
    $divsVideo = $item->find('div.video-container');
    if (empty($divsVideo)) {
        return $urls;
    }

    foreach ($divsVideo as $div) {
        // getVideoUrl() looks for the container inside the document it
        // receives, so a document is rebuilt from the div itself.
        $html = str_get_html($div->outertext);
        if (!$html) {
            // The container could not be parsed on its own.
            continue;
        }

        $videoUrl = $this->getVideoUrl($html);
        if (!empty($videoUrl)) {
            $urls[] = $videoUrl;
        }
    }

    return $urls;
}
/**
 * Retrieve the URL of the video held by a paragraph.
 *
 * The URL is the src attribute of the first iframe found in the first
 * 'div.video-container' element. The original author reports this works
 * for Youtube; other video platforms are untested.
 *
 * @param object $p DOM document to search; must offer find()
 * @return string|null Video URL (protocol-relative URLs get an 'https:'
 *                     prefix), or null when there is no container, no
 *                     iframe or no usable src value
 */
private function getVideoUrl($p)
{
    $divVideo = $p->find('div.video-container', 0);
    if (empty($divVideo)) {
        return null;
    }

    $iframe = $divVideo->find('iframe', 0);
    if (empty($iframe)) {
        return null;
    }

    // Only a real string can be a URL: a missing or valueless attribute
    // must not be passed to trim().
    $rawSrc = $iframe->getAttribute('src');
    if (!is_string($rawSrc)) {
        return null;
    }

    $src = trim($rawSrc);
    if (empty($src)) {
        return null;
    }

    // Protocol-relative URL: make it absolute.
    if (str_starts_with($src, '//')) {
        $src = 'https:' . $src;
    }

    return $src;
}
}