diff --git a/bridges/CADBridge.php b/bridges/CADBridge.php index 47ff165a..eb05fd16 100644 --- a/bridges/CADBridge.php +++ b/bridges/CADBridge.php @@ -1,12 +1,22 @@ collectExpandableDatas('http://cdn2.cad-comic.com/rss.xml'); + } + + protected function parseItem($newsItem){ + $item = $this->parseRSS_2_0_Item($newsItem); + $item['content'] = $this->CADExtractContent($item['uri']); + return $item; + } + private function CADExtractContent($url) { - $html3 = $this->getSimpleHTMLDOM($url); + $html3 = $this->get_cached($url); // The request might fail due to missing https support or wrong URL if($html3 == false) @@ -32,33 +42,6 @@ class CADBridge extends BridgeAbstract{ return ''; } - public function collectData(){ - function CADUrl($string) { - $html2 = explode("\"", $string); - $string = $html2[1]; - if (substr($string,0,4) != 'http') - return 'notanurl'; - return $string; - } - - $html = $this->getSimpleHTMLDOM('http://cdn2.cad-comic.com/rss.xml') or $this->returnServerError('Could not request CAD.'); - $limit = 0; - - foreach($html->find('item') as $element) { - if($limit < 5) { - $item = array(); - $item['title'] = $element->find('title', 0)->innertext; - $item['uri'] = CADUrl($element->find('description', 0)->innertext); - if ($item['uri'] != 'notanurl') { - $item['timestamp'] = strtotime($element->find('pubDate', 0)->plaintext); - $item['content'] = $this->CADExtractContent($item['uri']); - $this->items[] = $item; - $limit++; - } - } - } - } - public function getCacheDuration(){ return 3600*2; // 2 hours } diff --git a/bridges/CommonDreamsBridge.php b/bridges/CommonDreamsBridge.php index 446a6df0..e621db41 100644 --- a/bridges/CommonDreamsBridge.php +++ b/bridges/CommonDreamsBridge.php @@ -1,39 +1,26 @@ collectExpandableDatas('http://www.commondreams.org/rss.xml'); + } + + protected function parseItem($newsItem){ + $item = $this->parseRSS_2_0_Item($newsItem); + $item['content'] = $this->CommonDreamsExtractContent($item['uri']); + return $item; + } + private function CommonDreamsExtractContent($url) { - $html3 = $this->getSimpleHTMLDOM($url); + $html3 = $this->get_cached($url); $text = $html3->find('div[class=field--type-text-with-summary]', 0)->innertext; $html3->clear(); unset ($html3); return $text; } - - public function collectData(){ - - function CommonDreamsUrl($string) { - $html2 = explode(" ", $string); - $string = $html2[2] . "/node/" . $html2[0]; - return $string; - } - - $html = $this->getSimpleHTMLDOM('http://www.commondreams.org/rss.xml') or $this->returnServerError('Could not request CommonDreams.'); - $limit = 0; - foreach($html->find('item') as $element) { - if($limit < 4) { - $item = array(); - $item['title'] = $element->find('title', 0)->innertext; - $item['uri'] = CommonDreamsUrl($element->find('guid', 0)->innertext); - $item['timestamp'] = strtotime($element->find('pubDate', 0)->plaintext); - $item['content'] = $this->CommonDreamsExtractContent($item['uri']); - $this->items[] = $item; - $limit++; - } - } - } } diff --git a/bridges/DauphineLibereBridge.php b/bridges/DauphineLibereBridge.php index 143a6c0a..d8e10ddb 100644 --- a/bridges/DauphineLibereBridge.php +++ b/bridges/DauphineLibereBridge.php @@ -1,10 +1,10 @@ array( @@ -30,41 +30,31 @@ class DauphineLibereBridge extends BridgeAbstract { ) )); - private function ExtractContent($url, $context) { - $html2 = $this->getSimpleHTMLDOM($url); - $text = $html2->find('div.column', 0)->innertext; - $text = preg_replace('@]*?>.*?@si', '', $text); - return $text; - } + public function collectData(){ + $url = self::URI . 'rss'; - public function collectData(){ + if (empty($this->getInput('u'))) { + $url = self::URI . $this->getInput('u') . '/rss'; + } - $context = stream_context_create($opts); + $this->collectExpandableDatas($url); + } - if (empty($this->getInput('u'))) { - $html = $this->getSimpleHTMLDOM(self::URI.$this->getInput('u').'/rss') - or $this->returnServerError('Could not request DauphineLibere.'); - } else { - $html = $this->getSimpleHTMLDOM(self::URI.'rss') - or $this->returnServerError('Could not request DauphineLibere.'); - } - $limit = 0; + protected function parseItem($newsItem){ + $item = $this->parseRSS_2_0_Item($newsItem); + $item['content'] = $this->ExtractContent($item['uri']); + return $item; + } - foreach($html->find('item') as $element) { - if($limit < 10) { - $item = array(); - $item['title'] = $element->find('title', 0)->innertext; - $item['uri'] = $element->find('guid', 0)->plaintext; - $item['timestamp'] = strtotime($element->find('pubDate', 0)->plaintext); - $item['content'] = $this->ExtractContent($item['uri'], $context); - $this->items[] = $item; - $limit++; - } - } - } + private function ExtractContent($url) { + $html2 = $this->getSimpleHTMLDOM($url); + $text = $html2->find('div.column', 0)->innertext; + $text = preg_replace('@]*?>.*?@si', '', $text); + return $text; + } - public function getCacheDuration(){ - return 3600*2; // 2 hours - } + public function getCacheDuration(){ + return 3600*2; // 2 hours + } } ?> diff --git a/bridges/DeveloppezDotComBridge.php b/bridges/DeveloppezDotComBridge.php index 48e29741..52e52db1 100644 --- a/bridges/DeveloppezDotComBridge.php +++ b/bridges/DeveloppezDotComBridge.php @@ -1,11 +1,21 @@ collectExpandableDatas(self::URI . 'index/rss'); + } + + protected function parseItem($newsItem){ + $item = $this->parseRSS_2_0_Item($newsItem); + $item['content'] = $this->DeveloppezDotComExtractContent($item['uri']); + return $item; + } + private function DeveloppezDotComStripCDATA($string) { $string = str_replace('', '', $string); @@ -32,31 +42,12 @@ class DeveloppezDotComBridge extends BridgeAbstract{ } private function DeveloppezDotComExtractContent($url) { - $articleHTMLContent = $this->getSimpleHTMLDOM($url); + $articleHTMLContent = $this->get_cached($url); $text = $this->convert_smart_quotes($articleHTMLContent->find('div.content', 0)->innertext); $text = utf8_encode($text); return trim($text); } - public function collectData(){ - $rssFeed = $this->getSimpleHTMLDOM(self::URI.'index/rss') - or $this->returnServerError('Could not request '.self::URI.'index/rss'); - $limit = 0; - - foreach($rssFeed->find('item') as $element) { - if($limit < 10) { - $item = array(); - $item['title'] = $this->DeveloppezDotComStripCDATA($element->find('title', 0)->innertext); - $item['uri'] = $this->DeveloppezDotComStripCDATA($element->find('guid', 0)->plaintext); - $item['timestamp'] = strtotime($element->find('pubDate', 0)->plaintext); - $content = $this->DeveloppezDotComExtractContent($item['uri']); - $item['content'] = strlen($content) ? $content : $element->description; //In case of it is a tutorial, we just keep the original description - $this->items[] = $item; - $limit++; - } - } - } - public function getCacheDuration(){ return 1800; // 30min } diff --git a/bridges/FuturaSciencesBridge.php b/bridges/FuturaSciencesBridge.php index e4c8471f..beff9c8a 100644 --- a/bridges/FuturaSciencesBridge.php +++ b/bridges/FuturaSciencesBridge.php @@ -1,5 +1,5 @@ getInput('feed') . '.xml'; + $this->collectExpandableDatas($url); + } - function StripCDATA($string) { - $string = str_replace('', '', $string); - return $string; - } + protected function parseItem($newsItem){ + $item = $this->parseRSS_2_0_Item($newsItem); + $item['uri'] = str_replace('#xtor=RSS-8', '', $item['uri']); + $article = $this->get_cached($item['uri']) + or $this->returnServerError('Could not request Futura-Sciences: ' . $item['uri']); + $item['content'] = $this->ExtractArticleContent($article); + $item['author'] = empty($this->ExtractAuthor($article)) ? $item['author'] : $this->ExtractAuthor($article); + return $item; + } - function StripWithDelimiters($string, $start, $end) { - while (strpos($string, $start) !== false) { - $section_to_remove = substr($string, strpos($string, $start)); - $section_to_remove = substr($section_to_remove, 0, strpos($section_to_remove, $end) + strlen($end)); + function StripWithDelimiters($string, $start, $end) { + while (strpos($string, $start) !== false) { + $section_to_remove = substr($string, strpos($string, $start)); + $section_to_remove = substr($section_to_remove, 0, strpos($section_to_remove, $end) + strlen($end)); + $string = str_replace($section_to_remove, '', $string); + } return $string; + } + + function StripRecursiveHTMLSection($string, $tag_name, $tag_start) { + $open_tag = '<'.$tag_name; + $close_tag = ''; + $close_tag_length = strlen($close_tag); + if (strpos($tag_start, $open_tag) === 0) { + while (strpos($string, $tag_start) !== false) { + $max_recursion = 100; + $section_to_remove = null; + $section_start = strpos($string, $tag_start); + $search_offset = $section_start; + do { + $max_recursion--; + $section_end = strpos($string, $close_tag, $search_offset); + $search_offset = $section_end + $close_tag_length; + $section_to_remove = substr($string, $section_start, $section_end - $section_start + $close_tag_length); + $open_tag_count = substr_count($section_to_remove, $open_tag); + $close_tag_count = substr_count($section_to_remove, $close_tag); + } while ($open_tag_count > $close_tag_count && $max_recursion > 0); $string = str_replace($section_to_remove, '', $string); - } return $string; - } - - function StripRecursiveHTMLSection($string, $tag_name, $tag_start) { - $open_tag = '<'.$tag_name; - $close_tag = ''; - $close_tag_length = strlen($close_tag); - if (strpos($tag_start, $open_tag) === 0) { - while (strpos($string, $tag_start) !== false) { - $max_recursion = 100; - $section_to_remove = null; - $section_start = strpos($string, $tag_start); - $search_offset = $section_start; - do { - $max_recursion--; - $section_end = strpos($string, $close_tag, $search_offset); - $search_offset = $section_end + $close_tag_length; - $section_to_remove = substr($string, $section_start, $section_end - $section_start + $close_tag_length); - $open_tag_count = substr_count($section_to_remove, $open_tag); - $close_tag_count = substr_count($section_to_remove, $close_tag); - } while ($open_tag_count > $close_tag_count && $max_recursion > 0); - $string = str_replace($section_to_remove, '', $string); - } - } - return $string; - } - - // Extracts the author from an article or element - function ExtractAuthor($article, $element){ - $article_author = $article->find('span.author', 0); - if($article_author){ - $authorname = trim(str_replace(', Futura-Sciences', '', $article_author->plaintext)); - if(empty($authorname)){ - $element_author = $element->find('author', 0); - if($element_author) - $authorname = StripCDATA($element_author->plaintext); - else - return ''; - } - return $authorname; - } - return ''; - } - - $url = $this->getURI().'rss/'.$this->getInput('feed').'.xml'; - - $html = $this->getSimpleHTMLDOM($url) - or $this->returnServerError('Could not request Futura-Sciences: '.$url); - $limit = 0; - - foreach($html->find('item') as $element) { - if ($limit < 10) { - $article_url = str_replace('#xtor=RSS-8', '', StripCDATA($element->find('guid', 0)->plaintext)); - $article = $this->getSimpleHTMLDOM($article_url) or $this->returnServerError('Could not request Futura-Sciences: '.$article_url); - $contents = $article->find('div.content', 0)->innertext; - - foreach (array( - '
'); - $contents = StripWithDelimiters($contents, '
StripRecursiveHTMLSection($contents , 'div', $div_start); + } + + $contents = $this->StripWithDelimiters($contents, '
'); + $contents = $this->StripWithDelimiters($contents, '

'); - return $article_html; - } + function StripWithDelimiters($string, $start, $end) { + while (strpos($string, $start) !== false) { + $section_to_remove = substr($string, strpos($string, $start)); + $section_to_remove = substr($section_to_remove, 0, strpos($section_to_remove, $end) + strlen($end)); + $string = str_replace($section_to_remove, '', $string); + } return $string; + } - $html = $this->getSimpleHTMLDOM(self::URI.'rss/rss.xml') - or $this->returnServerError('Could not request LeMondeInformatique: ' - .self::URI.'rss/rss.xml'); - $limit = 0; - - foreach($html->find('item') as $element) { - if($limit < 5) { - - //Retrieve article details - $article_uri = $element->innertext; - $article_uri = substr($article_uri, strpos($article_uri, '') + 6); - $article_uri = substr($article_uri, 0, strpos($article_uri, '')); - $article_html = $this->getSimpleHTMLDOM($article_uri) or $this->returnServerError('Could not request LeMondeInformatique: '.$article_uri); - $article_content = CleanArticle($article_html->find('div#article', 0)->innertext); - $article_title = $article_html->find('h1.cleanprint-title', 0)->plaintext; - - //Build and add final item - $item = array(); - $item['uri'] = $article_uri; - $item['title'] = $article_title; - $item['author'] = StripCDATA($element->find('dc:creator', 0)->innertext); - $item['timestamp'] = strtotime($element->find('dc:date', 0)->plaintext); - $item['content'] = $article_content; - $this->items[] = $item; - $limit++; - } - } + function CleanArticle($article_html) { + $article_html = $this->StripWithDelimiters($article_html, ''); + $article_html = $this->StripWithDelimiters($article_html, '

'); + return $article_html; } public function getCacheDuration() { diff --git a/bridges/LichessBridge.php b/bridges/LichessBridge.php index f74c2bde..57108bd9 100644 --- a/bridges/LichessBridge.php +++ b/bridges/LichessBridge.php @@ -1,39 +1,22 @@ getSimpleHTMLDOM(self::URI.'.atom') - or $this->returnServerError('Could not retrieve Lichess blog feed.'); - - $posts_loaded = 0; - foreach($xml_feed->find('entry') as $entry) - { - if ($posts_loaded < 5) - { - $item = array(); - - $item['title'] = html_entity_decode($entry->find('title', 0)->innertext); - $item['author'] = $entry->find('author', 0)->find('name', 0)->innertext; - $item['uri'] = $entry->find('id', 0)->plaintext; - $item['timestamp'] = strtotime($entry->find('published', 0)->plaintext); - - $item['content'] = $this->retrieve_lichess_post($item['uri']); - - $this->items[] = $item; - $posts_loaded++; - } - } + public function collectData(){ + $this->collectExpandableDatas(self::URI . '.atom'); } - private function retrieve_lichess_post($blog_post_uri) - { + protected function parseItem($newsItem){ + $item = $this->parseATOMItem($newsItem); + $item['content'] = $this->retrieve_lichess_post($item['uri']); + return $item; + } + + private function retrieve_lichess_post($blog_post_uri){ if($this->get_cached_time($blog_post_uri) <= strtotime('-24 hours')) $this->remove_from_cache($blog_post_uriuri); diff --git a/bridges/NextInpactBridge.php b/bridges/NextInpactBridge.php index 8c35753d..a24a02e4 100644 --- a/bridges/NextInpactBridge.php +++ b/bridges/NextInpactBridge.php @@ -1,19 +1,23 @@ ', '', $string); - return $string; + public function collectData(){ + $this->collectExpandableDatas(self::URI . 'rss/news.xml'); + } + + protected function parseItem($newsItem){ + $item = $this->parseRSS_2_0_Item($newsItem); + $item['content'] = $this->ExtractContent($item['uri']); + return $item; } private function ExtractContent($url) { - $html2 = $this->getSimpleHTMLDOM($url); + $html2 = $this->get_cached($url); $text = '

'.$html2->find('span.sub_title', 0)->innertext.'

' .'

-

' .'
'.$html2->find('div[itemprop=articleBody]', 0)->innertext.'
'; @@ -22,22 +26,4 @@ class NextInpactBridge extends BridgeAbstract { $text = $text.'

'.$premium_article->innertext.'

'; return $text; } - - public function collectData(){ - $html = $this->getSimpleHTMLDOM(self::URI.'rss/news.xml') or $this->returnServerError('Could not request NextInpact.'); - $limit = 0; - - foreach($html->find('item') as $element) { - if($limit < 3) { - $item = array(); - $item['title'] = $this->StripCDATA($element->find('title', 0)->innertext); - $item['uri'] = $this->StripCDATA($element->find('guid', 0)->plaintext); - $item['author'] = $this->StripCDATA($element->find('creator', 0)->innertext); - $item['timestamp'] = strtotime($element->find('pubDate', 0)->plaintext); - $item['content'] = $this->ExtractContent($item['uri']); - $this->items[] = $item; - $limit++; - } - } - } } diff --git a/bridges/NextgovBridge.php b/bridges/NextgovBridge.php index ee4f2996..dee8c370 100644 --- a/bridges/NextgovBridge.php +++ b/bridges/NextgovBridge.php @@ -1,5 +1,5 @@ collectExpandableDatas(self::URI . 'rss/' . $this->getInput('category') . '/'); + } - function ExtractFromDelimiters($string, $start, $end) { - if (strpos($string, $start) !== false) { - $section_retrieved = substr($string, strpos($string, $start) + strlen($start)); - $section_retrieved = substr($section_retrieved, 0, strpos($section_retrieved, $end)); - return $section_retrieved; - } return false; - } + protected function parseItem($newsItem){ + $item = $this->parseRSS_2_0_Item($newsItem); - function StripWithDelimiters($string, $start, $end) { - while (strpos($string, $start) !== false) { - $section_to_remove = substr($string, strpos($string, $start)); - $section_to_remove = substr($section_to_remove, 0, strpos($section_to_remove, $end) + strlen($end)); - $string = str_replace($section_to_remove, '', $string); - } return $string; - } + $item['content'] = ''; - $category = $this->getInput('category'); - $url = $this->getURI().'rss/'.$category.'/'; - $html = $this->getSimpleHTMLDOM($url) or $this->returnServerError('Could not request Nextgov: '.$url); - $limit = 0; - - foreach ($html->find('item') as $element) { - if ($limit >= 10) { - break; + $namespaces = $newsItem->getNamespaces(true); + if(isset($namespaces['media'])){ + $media = $newsItem->children($namespaces['media']); + if(isset($media->content)){ + $attributes = $media->content->attributes(); + $item['content'] = ''; } - - $article_url = ExtractFromDelimiters($element->innertext, '', ''); - $article_author = ExtractFromDelimiters($element->innertext, 'dc/elements/1.1/">', ''); - $article_title = $element->find('title', 0)->plaintext; - $article_subtitle = $element->find('description', 0)->plaintext; - $article_timestamp = strtotime($element->find('pubDate', 0)->plaintext); - $article_thumbnail = ExtractFromDelimiters($element->innertext, 'getSimpleHTMLDOM($article_url) or $this->returnServerError('Could not request Nextgov: '.$article_url); - - $contents = $article->find('div.wysiwyg', 0)->innertext; - $contents = StripWithDelimiters($contents, '
', '
'); - $contents = StripWithDelimiters($contents, ''); //ad outer div - $contents = StripWithDelimiters($contents, ''); - $contents = ($article_thumbnail == '' ? '' : '

') - .'

'.$article_subtitle.'

' - .trim($contents); - - $item = array(); - $item['uri'] = $article_url; - $item['title'] = $article_title; - $item['author'] = $article_author; - $item['timestamp'] = $article_timestamp; - $item['content'] = $contents; - $this->items[] = $item; - $limit++; } + + $item['content'] .= $this->ExtractContent($item['uri']); + return $item; + } + + private function StripWithDelimiters($string, $start, $end) { + while (strpos($string, $start) !== false) { + $section_to_remove = substr($string, strpos($string, $start)); + $section_to_remove = substr($section_to_remove, 0, strpos($section_to_remove, $end) + strlen($end)); + $string = str_replace($section_to_remove, '', $string); + } return $string; + } + + private function ExtractContent($url){ + $article = $this->get_cached($url) + or $this->returnServerError('Could not request Nextgov: ' . $url); + + $contents = $article->find('div.wysiwyg', 0)->innertext; + $contents = $this->StripWithDelimiters($contents, '
', '
'); + $contents = $this->StripWithDelimiters($contents, ''); //ad outer div + return $this->StripWithDelimiters($contents, ''); + $contents = ($article_thumbnail == '' ? '' : '

') + .'

'.$article_subtitle.'

' + .trim($contents); } } diff --git a/bridges/NiceMatinBridge.php b/bridges/NiceMatinBridge.php index 3c189090..0f9d011a 100644 --- a/bridges/NiceMatinBridge.php +++ b/bridges/NiceMatinBridge.php @@ -1,13 +1,23 @@ collectExpandableDatas(self::URI . 'derniere-minute/rss'); + } + + protected function parseItem($newsItem){ + $item = $this->parseRSS_2_0_Item($newsItem); + $item['content'] = $this->NiceMatinExtractContent($item['uri']); + return $item; + } + private function NiceMatinExtractContent($url) { - $html = $this->getSimpleHTMLDOM($url); + $html = $this->get_cached($url); if(!$html) return 'Could not acquire content from url: ' . $url . '!'; @@ -19,29 +29,4 @@ class NiceMatinBridge extends BridgeAbstract{ $text = strip_tags($text, '

'); return $text; } - - public function collectData(){ - $html = $this->getSimpleHTMLDOM(self::URI.'derniere-minute/rss') - or $this->returnServerError('Could not request NiceMatin.'); - $limit = 0; - - foreach($html->find('item') as $element) { - if($limit >= 10) { - break; - } - // We need to fix the 'link' tag as simplehtmldom cannot parse it (just rename it and load back as dom) - $element_text = $element->outertext; - $element_text = str_replace('', '', $element_text); - $element_text = str_replace('', '', $element_text); - $element = str_get_html($element_text); - - $item = array(); - $item['title'] = $element->find('title', 0)->innertext; - $item['uri'] = $element->find('url', 0)->innertext; - $item['timestamp'] = strtotime($element->find('pubDate', 0)->plaintext); - $item['content'] = $this->NiceMatinExtractContent($item['uri']); - $this->items[] = $item; - $limit++; - } - } } diff --git a/bridges/NumeramaBridge.php b/bridges/NumeramaBridge.php index 1e80affb..48260a09 100644 --- a/bridges/NumeramaBridge.php +++ b/bridges/NumeramaBridge.php @@ -1,5 +1,5 @@ collectExpandableDatas(self::URI . 'feed/'); + } - function NumeramaStripCDATA($string) { - $string = str_replace('', '', $string); - return $string; - } + protected function parseItem($newsItem){ + $item = $this->parseRSS_2_0_Item($newsItem); + $item['content'] = $this->ExtractContent($item['uri']); + return $item; + } - $feed = self::URI.'feed/'; - $html = $this->getSimpleHTMLDOM($feed) or $this->returnServerError('Could not request Numerama: '.$feed); - $limit = 0; + private function ExtractContent($url){ + if($this->get_cached_time($url) <= strtotime('-24 hours')) + $this->remove_from_cache($url); - foreach($html->find('item') as $element) { - if($limit < 5) { - $item = array(); - $item['title'] = html_entity_decode(NumeramaStripCDATA($element->find('title', 0)->innertext)); - $item['author'] = NumeramaStripCDATA($element->find('dc:creator', 0)->innertext); - $item['uri'] = NumeramaStripCDATA($element->find('guid', 0)->plaintext); - $item['timestamp'] = strtotime($element->find('pubDate', 0)->plaintext); - - $article_url = NumeramaStripCDATA($element->find('guid', 0)->plaintext); - if($this->get_cached_time($article_url) <= strtotime('-24 hours')) - $this->remove_from_cache($article_url); - - $article_html = $this->get_cached($article_url) or $this->returnServerError('Could not request Numerama: '.$article_url); - $contents = $article_html->find('section[class=related-article]', 0)->innertext = ''; // remove related articles block - $contents = ''; // add post picture - $contents = $contents.$article_html->find('article[class=post-content]', 0)->innertext; // extract the post - - $item['content'] = $contents; - $this->items[] = $item; - $limit++; - } - } + $article_html = $this->get_cached($url) or $this->returnServerError('Could not request Numerama: '.$url); + $contents = $article_html->find('section[class=related-article]', 0)->innertext = ''; // remove related articles block + $contents = ''; // add post picture + return $contents . $article_html->find('article[class=post-content]', 0)->innertext; // extract the post } public function getCacheDuration() {