From d6d0f422f58193acdef698c11f8e644753454efa Mon Sep 17 00:00:00 2001 From: DoumanAsh Date: Mon, 6 Apr 2015 08:36:41 +0300 Subject: [PATCH] [search engine] engines update --- src/searchengine/nova/engines/extratorrent.py | 205 +++++++++++------- .../nova/engines/legittorrents.py | 6 +- src/searchengine/nova/engines/mininova.py | 194 ++++++++++------- .../nova/engines/torrentreactor.py | 154 ++++++------- src/searchengine/nova/engines/versions.txt | 9 +- .../nova3/engines/extratorrent.py | 205 +++++++++++------- .../nova3/engines/legittorrents.py | 6 +- src/searchengine/nova3/engines/mininova.py | 194 ++++++++++------- .../nova3/engines/torrentreactor.py | 153 ++++++------- src/searchengine/nova3/engines/versions.txt | 9 +- 10 files changed, 644 insertions(+), 491 deletions(-) diff --git a/src/searchengine/nova/engines/extratorrent.py b/src/searchengine/nova/engines/extratorrent.py index 2956406f4..19fce553c 100644 --- a/src/searchengine/nova/engines/extratorrent.py +++ b/src/searchengine/nova/engines/extratorrent.py @@ -1,4 +1,4 @@ -#VERSION: 1.2 +#VERSION: 2.0 #AUTHORS: Christophe Dumez (chris@qbittorrent.org) # Redistribution and use in source and binary forms, with or without @@ -25,92 +25,135 @@ # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # POSSIBILITY OF SUCH DAMAGE. 
- +from HTMLParser import HTMLParser +from httplib import HTTPConnection as http +#qBt from novaprinter import prettyPrinter -from helpers import retrieve_url, download_file -import sgmllib -import re +from helpers import download_file class extratorrent(object): - url = 'http://extratorrent.cc' - name = 'extratorrent' - supported_categories = {'all': '', 'movies': '4', 'tv': '8', 'music': '5', 'games': '3', 'anime': '1', 'software': '7', 'books': '2', 'pictures': '6'} + """ Search engine class """ + url = 'http://extratorrent.cc' + name = 'ExtraTorrent' + supported_categories = {'all' : '0', + 'movies' : '4', + 'tv' : '8', + 'music' : '5', + 'games' : '3', + 'anime' : '1', + 'software' : '7', + 'books' : '2', + 'pictures' : '6'} - def __init__(self): - self.results = [] - self.parser = self.SimpleSGMLParser(self.results, self.url) + def download_torrent(self, info): + """ Downloader """ + print(download_file(info)) - def download_torrent(self, info): - print download_file(info) + class MyHtmlParseWithBlackJack(HTMLParser): + """ Parser class """ + def __init__(self, list_searches, url): + HTMLParser.__init__(self) + self.url = url + self.list_searches = list_searches + self.current_item = None + self.cur_item_name = None + self.pending_size = False + self.next_queries = True + self.pending_next_queries = False - class SimpleSGMLParser(sgmllib.SGMLParser): - def __init__(self, results, url, *args): - sgmllib.SGMLParser.__init__(self) - self.url = url - self.td_counter = None - self.current_item = None - self.start_name = False - self.results = results - - def start_a(self, attr): - params = dict(attr) - #print params - if params.has_key('href') and params['href'].startswith("/torrent_download/"): - self.current_item = {} - self.td_counter = 0 - self.start_name = False - torrent_id = '/'.join(params['href'].split('/')[2:]) - self.current_item['link']=self.url+'/download/'+torrent_id - elif params.has_key('href') and params['href'].startswith("/torrent/") and 
params['href'].endswith(".html"): - self.current_item['desc_link'] = self.url + params['href'].strip() - self.start_name = True - - def handle_data(self, data): - if self.td_counter == 2: - if not self.current_item.has_key('name') and self.start_name: - self.current_item['name'] = data.strip() - elif self.td_counter == 3: - if not self.current_item.has_key('size'): - self.current_item['size'] = '' - self.current_item['size']+= data.replace(" ", " ").strip() - elif self.td_counter == 4: - if not self.current_item.has_key('seeds'): - self.current_item['seeds'] = '' - self.current_item['seeds']+= data.strip() - elif self.td_counter == 5: - if not self.current_item.has_key('leech'): - self.current_item['leech'] = '' - self.current_item['leech']+= data.strip() - - def start_td(self,attr): - if isinstance(self.td_counter,int): - self.td_counter += 1 - if self.td_counter > 5: - self.td_counter = None - # Display item + def handle_starttag(self, tag, attrs): if self.current_item: - self.current_item['engine_url'] = self.url - if not self.current_item['seeds'].isdigit(): - self.current_item['seeds'] = 0 - if not self.current_item['leech'].isdigit(): - self.current_item['leech'] = 0 - prettyPrinter(self.current_item) - self.results.append('a') + if tag == "a": + params = dict(attrs) + link = params['href'] - def search(self, what, cat='all'): - ret = [] - i = 1 - while True and i<11: - results = [] - parser = self.SimpleSGMLParser(results, self.url) - dat = retrieve_url(self.url+'/advanced_search/?with=%s&s_cat=%s&page=%d'%(what, self.supported_categories[cat], i)) - results_re = re.compile('(?s).*') - for match in results_re.finditer(dat): - res_tab = match.group(0) - parser.feed(res_tab) + if not link.startswith("/torrent"): + return + + if link[8] == "/": + #description + self.current_item["desc_link"] = "".join((self.url, link)) + #remove view at the beginning + self.current_item["name"] = params["title"][5:] + self.pending_size = True + elif link[8] == "_": + #download 
link + link = link.replace("torrent_", "", 1) + self.current_item["link"] = "".join((self.url, link)) + + elif tag == "td": + if self.pending_size: + self.cur_item_name = "size" + self.current_item["size"] = "" + self.pending_size = False + + for attr in attrs: + if attr[0] == "class": + if attr[1][0] == "s": + self.cur_item_name = "seeds" + self.current_item["seeds"] = "" + elif attr[1][0] == "l": + self.cur_item_name = "leech" + self.current_item["leech"] = "" + break + + + elif tag == "tr": + for attr in attrs: + if attr[0] == "class" and attr[1].startswith("tl"): + self.current_item = dict() + self.current_item["engine_url"] = self.url + break + + elif self.pending_next_queries: + if tag == "a": + params = dict(attrs) + self.list_searches.append(params['href']) + if params["title"] == "10": + self.pending_next_queries = False + else: + self.pending_next_queries = False + + elif self.next_queries: + if tag == "b" and ("class", "pager_no_link") in attrs: + self.next_queries = False + self.pending_next_queries = True + + def handle_data(self, data): + if self.cur_item_name: + temp = self.current_item[self.cur_item_name] + self.current_item[self.cur_item_name] = " ".join((temp, data)) + #Due to utf-8 we need to handle data two times if there is space + if not self.cur_item_name == "size": + self.cur_item_name = None + + def handle_endtag(self, tag): + if self.current_item: + if tag == "tr": + prettyPrinter(self.current_item) + self.current_item = None + + def search(self, what, cat="all"): + """ Performs search """ + connection = http("extratorrent.cc") + + query = "".join(("/search/?new=1&search=", what, "&s_cat=", self.supported_categories[cat])) + + connection.request("GET", query) + response = connection.getresponse() + if response.status != 200: + return + + list_searches = [] + parser = self.MyHtmlParseWithBlackJack(list_searches, self.url) + parser.feed(response.read().decode('utf-8')) parser.close() - break - if len(results) <= 0: - break - i += 1 - + + for 
search_query in list_searches: + connection.request("GET", search_query) + response = connection.getresponse() + parser.feed(response.read().decode('utf-8')) + parser.close() + + connection.close() + return diff --git a/src/searchengine/nova/engines/legittorrents.py b/src/searchengine/nova/engines/legittorrents.py index be083053e..a6b9b6f18 100644 --- a/src/searchengine/nova/engines/legittorrents.py +++ b/src/searchengine/nova/engines/legittorrents.py @@ -1,4 +1,4 @@ -#VERSION: 1.02 +#VERSION: 1.03 #AUTHORS: Christophe Dumez (chris@qbittorrent.org) # Redistribution and use in source and binary forms, with or without @@ -36,10 +36,6 @@ class legittorrents(object): name = 'legittorrents' supported_categories = {'all': '', 'movies': '1', 'tv': '13', 'music': '2', 'games': '3', 'anime': '5', 'books': '6'} - def __init__(self): - self.results = [] - self.parser = self.SimpleSGMLParser(self.results, self.url) - def download_torrent(self, info): print download_file(info) diff --git a/src/searchengine/nova/engines/mininova.py b/src/searchengine/nova/engines/mininova.py index 5355b0ec7..dc132cd6c 100644 --- a/src/searchengine/nova/engines/mininova.py +++ b/src/searchengine/nova/engines/mininova.py @@ -1,4 +1,4 @@ -#VERSION: 1.51 +#VERSION: 2.00 #AUTHORS: Christophe Dumez (chris@qbittorrent.org) #CONTRIBUTORS: Diego de las Heras (diegodelasheras@gmail.com) @@ -26,90 +26,124 @@ # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # POSSIBILITY OF SUCH DAMAGE. 
+from HTMLParser import HTMLParser +from httplib import HTTPConnection as http from novaprinter import prettyPrinter -from helpers import retrieve_url, download_file -import sgmllib -import re +from helpers import download_file class mininova(object): - # Mandatory properties - url = 'http://www.mininova.org' - name = 'Mininova' - supported_categories = {'all': '0', 'movies': '4', 'tv': '8', 'music': '5', 'games': '3', 'anime': '1', 'software': '7', 'pictures': '6', 'books': '2'} + """ Search engine class """ + url = 'http://www.mininova.org' + name = 'Mininova' + supported_categories = {'all' : '0', + 'movies' : '4', + 'tv' : '8', + 'music' : '5', + 'games' : '3', + 'anime' : '1', + 'software' : '7', + 'pictures' : '6', + 'books' : '2'} - def __init__(self): - self.results = [] - self.parser = self.SimpleSGMLParser(self.results, self.url) + def download_torrent(self, info): + print(download_file(info)) - def download_torrent(self, info): - print download_file(info) + class MyHtmlParseWithBlackJack(HTMLParser): + """ Parser class """ + def __init__(self, list_searches, url): + HTMLParser.__init__(self) + self.list_searches = list_searches + self.url = url + self.table_results = False + self.current_item = None + self.cur_item_name = None + self.next_queries = True - class SimpleSGMLParser(sgmllib.SGMLParser): - def __init__(self, results, url, *args): - sgmllib.SGMLParser.__init__(self) - self.url = url - self.td_counter = None - self.current_item = None - self.results = results - - def start_a(self, attr): - params = dict(attr) - #print params - if params.has_key('href'): - if params['href'].startswith("/get/"): - self.current_item = {} - self.td_counter = 0 - self.current_item['link']=self.url+params['href'].strip() - elif params['href'].startswith("/tor/") and self.current_item is not None: - self.current_item['desc_link']=self.url+params['href'].strip() - - def handle_data(self, data): - if self.td_counter == 0: - if not self.current_item.has_key('name'): - 
self.current_item['name'] = '' - self.current_item['name']+= data - elif self.td_counter == 1: - if not self.current_item.has_key('size'): - self.current_item['size'] = '' - self.current_item['size']+= data.strip() - elif self.td_counter == 2: - if not self.current_item.has_key('seeds'): - self.current_item['seeds'] = '' - self.current_item['seeds']+= data.strip() - elif self.td_counter == 3: - if not self.current_item.has_key('leech'): - self.current_item['leech'] = '' - self.current_item['leech']+= data.strip() - - def start_td(self,attr): - if isinstance(self.td_counter,int): - self.td_counter += 1 - if self.td_counter > 4: - self.td_counter = None - # Display item - if self.current_item: - self.current_item['engine_url'] = self.url - if not self.current_item['seeds'].isdigit(): - self.current_item['seeds'] = 0 - if not self.current_item['leech'].isdigit(): - self.current_item['leech'] = 0 - prettyPrinter(self.current_item) - self.results.append('a') + def handle_starttag_tr(self, _): + """ Handler of tr start tag """ + self.current_item = dict() - def search(self, what, cat='all'): - ret = [] - i = 1 - while True and i<11: - results = [] - parser = self.SimpleSGMLParser(results, self.url) - dat = retrieve_url(self.url+'/search/%s/%s/seeds/%d'%(what, self.supported_categories[cat], i)) - results_re = re.compile('(?s)

Search results for.*') - for match in results_re.finditer(dat): - res_tab = match.group(0) - parser.feed(res_tab) + def handle_starttag_a(self, attrs): + """ Handler of a start tag """ + params = dict(attrs) + link = params["href"] + + if link.startswith("/get/"): + #download link + self.current_item["link"] = "".join((self.url, link)) + elif link.startswith("/tor/"): + #description + self.current_item["desc_link"] = "".join((self.url, link)) + self.cur_item_name = "name" + self.current_item["name"] = "" + elif self.next_queries and link.startswith("/search"): + if params["title"].startswith("Page"): + self.list_searches.append(link) + + def handle_starttag_td(self, attrs): + """ Handler of td start tag """ + if ("align", "right") in attrs: + if not "size" in self.current_item.keys(): + self.cur_item_name = "size" + self.current_item["size"] = "" + + def handle_starttag_span(self, attrs): + """ Handler of span start tag """ + if ("class", "g") in attrs: + self.cur_item_name = "seeds" + self.current_item["seeds"] = "" + elif ("class", "b") in attrs: + self.cur_item_name = "leech" + self.current_item["leech"] = "" + + def handle_starttag(self, tag, attrs): + """ Parser's start tag handler """ + if self.table_results: + dispatcher = getattr(self, "_".join(("handle_starttag", tag)), None) + if dispatcher: + dispatcher(attrs) + + elif tag == "table": + self.table_results = ("class", "maintable") in attrs + + def handle_endtag(self, tag): + """ Parser's end tag handler """ + if tag == "tr" and self.current_item: + self.current_item["engine_url"] = self.url + prettyPrinter(self.current_item) + self.current_item = None + elif self.cur_item_name: + if tag == "a" or tag == "span": + self.cur_item_name = None + + def handle_data(self, data): + """ Parser's data handler """ + if self.cur_item_name: + temp = self.current_item[self.cur_item_name] + self.current_item[self.cur_item_name] = " ".join((temp, data)) + + def search(self, what, cat="all"): + """ Performs search """ + 
connection = http("www.mininova.org") + + query = "/".join(("/search", what, self.supported_categories[cat], "seeds")) + + connection.request("GET", query) + response = connection.getresponse() + if response.status != 200: + return + + list_searches = [] + parser = self.MyHtmlParseWithBlackJack(list_searches, self.url) + parser.feed(response.read().decode('utf-8')) parser.close() - break - if len(results) <= 0: - break - i += 1 - + + parser.next_queries = False + for search_query in list_searches: + connection.request("GET", search_query) + response = connection.getresponse() + parser.feed(response.read().decode('utf-8')) + parser.close() + + connection.close() + return diff --git a/src/searchengine/nova/engines/torrentreactor.py b/src/searchengine/nova/engines/torrentreactor.py index ee74f4e75..dff7d35f2 100644 --- a/src/searchengine/nova/engines/torrentreactor.py +++ b/src/searchengine/nova/engines/torrentreactor.py @@ -1,4 +1,4 @@ -#VERSION: 1.33 +#VERSION: 1.35 #AUTHORS: Gekko Dam Beer (gekko04@users.sourceforge.net) #CONTRIBUTORS: Christophe Dumez (chris@qbittorrent.org) # Bruno Barbieri (brunorex@gmail.com) @@ -28,92 +28,94 @@ # POSSIBILITY OF SUCH DAMAGE. 
from novaprinter import prettyPrinter -from helpers import retrieve_url, download_file -from urllib2 import HTTPError -from HTMLParser import HTMLParser +from helpers import download_file import urllib +from HTMLParser import HTMLParser +from httplib import HTTPConnection as http import re class torrentreactor(object): - url = 'http://www.torrentreactor.net' - name = 'TorrentReactor.Net' - supported_categories = {'all': '', 'movies': '5', 'tv': '8', 'music': '6', 'games': '3', 'anime': '1', 'software': '2'} + url = 'http://www.torrentreactor.net' + name = 'TorrentReactor.Net' + supported_categories = {'all': '', 'movies': '5', 'tv': '8', 'music': '6', 'games': '3', 'anime': '1', 'software': '2'} - def download_torrent(self, info): - print download_file(info) + def download_torrent(self, info): + print(download_file(info)) - class SimpleHTMLParser(HTMLParser): - def __init__(self, results, url, *args): - HTMLParser.__init__(self) - self.td_counter = None - self.current_item = None - self.results = results - self.id = None - self.url = url - self.dispatcher = { 'a' : self.start_a, 'td' : self.start_td } + class SimpleHTMLParser(HTMLParser): + def __init__(self, results, url, *args): + HTMLParser.__init__(self) + self.td_counter = None + self.current_item = None + self.results = results + self.id = None + self.url = url + self.dispatcher = { 'a' : self.start_a, 'td' : self.start_td } - def handle_starttag(self, tag, attrs): - if tag in self.dispatcher: - self.dispatcher[tag](attrs) + def handle_starttag(self, tag, attrs): + if tag in self.dispatcher: + self.dispatcher[tag](attrs) - def start_a(self, attr): - params = dict(attr) - if re.match("/torrents/\d+.*", params['href']): - self.current_item = {} - self.current_item['desc_link'] = self.url+params['href'].strip() - elif 'torrentreactor.net/download.php' in params['href']: - self.td_counter = 0 - self.current_item['link'] = params['href'].strip() - self.current_item['name'] = 
urllib.unquote_plus(params['href'].split('&')[1].split('name=')[1]) + def start_a(self, attr): + params = dict(attr) + if re.match("/torrents/\d+.*", params['href']): + self.current_item = {} + self.current_item['desc_link'] = self.url+params['href'].strip() + elif 'torrentreactor.net/download.php' in params['href']: + self.td_counter = 0 + self.current_item['link'] = params['href'].strip() + self.current_item['name'] = urllib.unquote_plus(params['href'].split('&')[1].split('name=')[1]) - def handle_data(self, data): - if self.td_counter == 1: - if not self.current_item.has_key('size'): - self.current_item['size'] = '' - self.current_item['size']+= data.strip() - elif self.td_counter == 2: - if not self.current_item.has_key('seeds'): - self.current_item['seeds'] = '' - self.current_item['seeds']+= data.strip() - elif self.td_counter == 3: - if not self.current_item.has_key('leech'): - self.current_item['leech'] = '' - self.current_item['leech']+= data.strip() + def handle_data(self, data): + if self.td_counter == 1: + if 'size' not in self.current_item: + self.current_item['size'] = '' + self.current_item['size']+= data.strip() + elif self.td_counter == 2: + if 'seeds' not in self.current_item: + self.current_item['seeds'] = '' + self.current_item['seeds']+= data.strip() + elif self.td_counter == 3: + if 'leech' not in self.current_item: + self.current_item['leech'] = '' + self.current_item['leech']+= data.strip() - def start_td(self,attr): - if isinstance(self.td_counter,int): - self.td_counter += 1 - if self.td_counter > 3: - self.td_counter = None - # add item to results - if self.current_item: - self.current_item['engine_url'] = self.url - if not self.current_item['seeds'].isdigit(): - self.current_item['seeds'] = 0 - if not self.current_item['leech'].isdigit(): - self.current_item['leech'] = 0 - prettyPrinter(self.current_item) - self.has_results = True - self.results.append('a') + def start_td(self,attr): + if isinstance(self.td_counter,int): + 
self.td_counter += 1 + if self.td_counter > 3: + self.td_counter = None + # add item to results + if self.current_item: + self.current_item['engine_url'] = self.url + if not self.current_item['seeds'].isdigit(): + self.current_item['seeds'] = 0 + if not self.current_item['leech'].isdigit(): + self.current_item['leech'] = 0 + prettyPrinter(self.current_item) + self.has_results = True + self.results.append('a') - def __init__(self): - self.results = [] - self.parser = self.SimpleHTMLParser(self.results, self.url) + def search(self, what, cat='all'): + i = 0 + dat = '' + connection = http("www.torrentreactor.net") - def search(self, what, cat='all'): - i = 0 - dat = '' - while True and i<11: - results = [] - parser = self.SimpleHTMLParser(results, self.url) + while True and i<11: + results = [] + parser = self.SimpleHTMLParser(results, self.url) + query = '/torrents-search/%s/%d?sort=seeders.desc&type=all&period=none&categories=%s'%(what, (i*35), self.supported_categories[cat]) + connection.request("GET", query) + response = connection.getresponse() + if response.status != 200: + break - try: - dat = retrieve_url(self.url+'/torrent-search/%s/%s?sort=seeders.desc&type=all&period=none&categories=%s'%(what, (i*35), self.supported_categories[cat])) - except HTTPError: - break + dat = response.read().decode('utf-8') - parser.feed(dat) - parser.close() - if len(results) <= 0: - break - i += 1 + parser.feed(dat) + parser.close() + if len(results) <= 0: + break + i += 1 + + connection.close() diff --git a/src/searchengine/nova/engines/versions.txt b/src/searchengine/nova/engines/versions.txt index d581a676f..77fb875d5 100644 --- a/src/searchengine/nova/engines/versions.txt +++ b/src/searchengine/nova/engines/versions.txt @@ -1,8 +1,9 @@ -torrentreactor: 1.33 -mininova: 1.51 -piratebay: 2.11 extratorrent: 1.2 +torrentreactor: 1.35 +mininova: 2.00 +piratebay: 2.11 +extratorrent: 2.0 kickasstorrents: 1.26 btdigg: 1.24 -legittorrents: 1.02 torrentz: 2.13 +legittorrents: 1.03 diff 
--git a/src/searchengine/nova3/engines/extratorrent.py b/src/searchengine/nova3/engines/extratorrent.py index df1ef9b24..de3dcb9a2 100644 --- a/src/searchengine/nova3/engines/extratorrent.py +++ b/src/searchengine/nova3/engines/extratorrent.py @@ -1,4 +1,4 @@ -#VERSION: 1.2 +#VERSION: 2.0 #AUTHORS: Christophe Dumez (chris@qbittorrent.org) # Redistribution and use in source and binary forms, with or without @@ -25,92 +25,135 @@ # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # POSSIBILITY OF SUCH DAMAGE. - +from html.parser import HTMLParser +from http.client import HTTPConnection as http +#qBt from novaprinter import prettyPrinter -from helpers import retrieve_url, download_file -import sgmllib3 -import re +from helpers import download_file class extratorrent(object): - url = 'http://extratorrent.cc' - name = 'extratorrent' - supported_categories = {'all': '', 'movies': '4', 'tv': '8', 'music': '5', 'games': '3', 'anime': '1', 'software': '7', 'books': '2', 'pictures': '6'} + """ Search engine class """ + url = 'http://extratorrent.cc' + name = 'ExtraTorrent' + supported_categories = {'all' : '0', + 'movies' : '4', + 'tv' : '8', + 'music' : '5', + 'games' : '3', + 'anime' : '1', + 'software' : '7', + 'books' : '2', + 'pictures' : '6'} - def __init__(self): - self.results = [] - self.parser = self.SimpleSGMLParser(self.results, self.url) + def download_torrent(self, info): + """ Downloader """ + print(download_file(info)) - def download_torrent(self, info): - print(download_file(info)) + class MyHtmlParseWithBlackJack(HTMLParser): + """ Parser class """ + def __init__(self, list_searches, url): + HTMLParser.__init__(self) + self.url = url + self.list_searches = list_searches + self.current_item = None + self.cur_item_name = None + self.pending_size = False + self.next_queries = True + self.pending_next_queries = False - class SimpleSGMLParser(sgmllib3.SGMLParser): - def __init__(self, results, url, *args): - 
sgmllib3.SGMLParser.__init__(self) - self.url = url - self.td_counter = None - self.current_item = None - self.start_name = False - self.results = results - - def start_a(self, attr): - params = dict(attr) - #print params - if 'href' in params and params['href'].startswith("/torrent_download/"): - self.current_item = {} - self.td_counter = 0 - self.start_name = False - torrent_id = '/'.join(params['href'].split('/')[2:]) - self.current_item['link']=self.url+'/download/'+torrent_id - elif 'href' in params and params['href'].startswith("/torrent/") and params['href'].endswith(".html"): - self.current_item['desc_link'] = self.url + params['href'].strip() - self.start_name = True - - def handle_data(self, data): - if self.td_counter == 2: - if 'name' not in self.current_item and self.start_name: - self.current_item['name'] = data.strip() - elif self.td_counter == 3: - if 'size' not in self.current_item: - self.current_item['size'] = '' - self.current_item['size']+= data.replace(" ", " ").strip() - elif self.td_counter == 4: - if 'seeds' not in self.current_item: - self.current_item['seeds'] = '' - self.current_item['seeds']+= data.strip() - elif self.td_counter == 5: - if 'leech' not in self.current_item: - self.current_item['leech'] = '' - self.current_item['leech']+= data.strip() - - def start_td(self,attr): - if isinstance(self.td_counter,int): - self.td_counter += 1 - if self.td_counter > 5: - self.td_counter = None - # Display item + def handle_starttag(self, tag, attrs): if self.current_item: - self.current_item['engine_url'] = self.url - if not self.current_item['seeds'].isdigit(): - self.current_item['seeds'] = 0 - if not self.current_item['leech'].isdigit(): - self.current_item['leech'] = 0 - prettyPrinter(self.current_item) - self.results.append('a') + if tag == "a": + params = dict(attrs) + link = params['href'] - def search(self, what, cat='all'): - ret = [] - i = 1 - while True and i<11: - results = [] - parser = self.SimpleSGMLParser(results, self.url) - 
dat = retrieve_url(self.url+'/advanced_search/?with=%s&s_cat=%s&page=%d'%(what, self.supported_categories[cat], i)) - results_re = re.compile('(?s)

.*') - for match in results_re.finditer(dat): - res_tab = match.group(0) - parser.feed(res_tab) + if not link.startswith("/torrent"): + return + + if link[8] == "/": + #description + self.current_item["desc_link"] = "".join((self.url, link)) + #remove view at the beginning + self.current_item["name"] = params["title"][5:] + self.pending_size = True + elif link[8] == "_": + #download link + link = link.replace("torrent_", "", 1) + self.current_item["link"] = "".join((self.url, link)) + + elif tag == "td": + if self.pending_size: + self.cur_item_name = "size" + self.current_item["size"] = "" + self.pending_size = False + + for attr in attrs: + if attr[0] == "class": + if attr[1][0] == "s": + self.cur_item_name = "seeds" + self.current_item["seeds"] = "" + elif attr[1][0] == "l": + self.cur_item_name = "leech" + self.current_item["leech"] = "" + break + + + elif tag == "tr": + for attr in attrs: + if attr[0] == "class" and attr[1].startswith("tl"): + self.current_item = dict() + self.current_item["engine_url"] = self.url + break + + elif self.pending_next_queries: + if tag == "a": + params = dict(attrs) + self.list_searches.append(params['href']) + if params["title"] == "10": + self.pending_next_queries = False + else: + self.pending_next_queries = False + + elif self.next_queries: + if tag == "b" and ("class", "pager_no_link") in attrs: + self.next_queries = False + self.pending_next_queries = True + + def handle_data(self, data): + if self.cur_item_name: + temp = self.current_item[self.cur_item_name] + self.current_item[self.cur_item_name] = " ".join((temp, data)) + #Due to utf-8 we need to handle data two times if there is space + if not self.cur_item_name == "size": + self.cur_item_name = None + + def handle_endtag(self, tag): + if self.current_item: + if tag == "tr": + prettyPrinter(self.current_item) + self.current_item = None + + def search(self, what, cat="all"): + """ Performs search """ + connection = http("extratorrent.cc") + + query = 
"".join(("/search/?new=1&search=", what, "&s_cat=", self.supported_categories[cat])) + + connection.request("GET", query) + response = connection.getresponse() + if response.status != 200: + return + + list_searches = [] + parser = self.MyHtmlParseWithBlackJack(list_searches, self.url) + parser.feed(response.read().decode('utf-8')) parser.close() - break - if len(results) <= 0: - break - i += 1 - + + for search_query in list_searches: + connection.request("GET", search_query) + response = connection.getresponse() + parser.feed(response.read().decode('utf-8')) + parser.close() + + connection.close() + return diff --git a/src/searchengine/nova3/engines/legittorrents.py b/src/searchengine/nova3/engines/legittorrents.py index 290852f0d..60297c574 100644 --- a/src/searchengine/nova3/engines/legittorrents.py +++ b/src/searchengine/nova3/engines/legittorrents.py @@ -1,4 +1,4 @@ -#VERSION: 1.03 +#VERSION: 1.04 #AUTHORS: Christophe Dumez (chris@qbittorrent.org) # Redistribution and use in source and binary forms, with or without @@ -36,10 +36,6 @@ class legittorrents(object): name = 'legittorrents' supported_categories = {'all': '', 'movies': '1', 'tv': '13', 'music': '2', 'games': '3', 'anime': '5', 'books': '6'} - def __init__(self): - self.results = [] - self.parser = self.SimpleSGMLParser(self.results, self.url) - def download_torrent(self, info): print(download_file(info)) diff --git a/src/searchengine/nova3/engines/mininova.py b/src/searchengine/nova3/engines/mininova.py index 96d6ed8e5..12544db09 100644 --- a/src/searchengine/nova3/engines/mininova.py +++ b/src/searchengine/nova3/engines/mininova.py @@ -1,4 +1,4 @@ -#VERSION: 1.51 +#VERSION: 2.00 #AUTHORS: Christophe Dumez (chris@qbittorrent.org) #CONTRIBUTORS: Diego de las Heras (diegodelasheras@gmail.com) @@ -26,90 +26,124 @@ # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # POSSIBILITY OF SUCH DAMAGE. 
+from html.parser import HTMLParser +from http.client import HTTPConnection as http from novaprinter import prettyPrinter -from helpers import retrieve_url, download_file -import sgmllib3 -import re +from helpers import download_file class mininova(object): - # Mandatory properties - url = 'http://www.mininova.org' - name = 'Mininova' - supported_categories = {'all': '0', 'movies': '4', 'tv': '8', 'music': '5', 'games': '3', 'anime': '1', 'software': '7', 'pictures': '6', 'books': '2'} + """ Search engine class """ + url = 'http://www.mininova.org' + name = 'Mininova' + supported_categories = {'all' : '0', + 'movies' : '4', + 'tv' : '8', + 'music' : '5', + 'games' : '3', + 'anime' : '1', + 'software' : '7', + 'pictures' : '6', + 'books' : '2'} - def __init__(self): - self.results = [] - self.parser = self.SimpleSGMLParser(self.results, self.url) + def download_torrent(self, info): + print(download_file(info)) - def download_torrent(self, info): - print(download_file(info)) + class MyHtmlParseWithBlackJack(HTMLParser): + """ Parser class """ + def __init__(self, list_searches, url): + HTMLParser.__init__(self) + self.list_searches = list_searches + self.url = url + self.table_results = False + self.current_item = None + self.cur_item_name = None + self.next_queries = True - class SimpleSGMLParser(sgmllib3.SGMLParser): - def __init__(self, results, url, *args): - sgmllib3.SGMLParser.__init__(self) - self.url = url - self.td_counter = None - self.current_item = None - self.results = results - - def start_a(self, attr): - params = dict(attr) - #print params - if 'href' in params: - if params['href'].startswith("/get/"): - self.current_item = {} - self.td_counter = 0 - self.current_item['link']=self.url+params['href'].strip() - elif params['href'].startswith("/tor/") and self.current_item is not None: - self.current_item['desc_link']=self.url+params['href'].strip() - - def handle_data(self, data): - if self.td_counter == 0: - if 'name' not in self.current_item: - 
self.current_item['name'] = '' - self.current_item['name']+= data - elif self.td_counter == 1: - if 'size' not in self.current_item: - self.current_item['size'] = '' - self.current_item['size']+= data.strip() - elif self.td_counter == 2: - if 'seeds' not in self.current_item: - self.current_item['seeds'] = '' - self.current_item['seeds']+= data.strip() - elif self.td_counter == 3: - if 'leech' not in self.current_item: - self.current_item['leech'] = '' - self.current_item['leech']+= data.strip() - - def start_td(self,attr): - if isinstance(self.td_counter,int): - self.td_counter += 1 - if self.td_counter > 4: - self.td_counter = None - # Display item - if self.current_item: - self.current_item['engine_url'] = self.url - if not self.current_item['seeds'].isdigit(): - self.current_item['seeds'] = 0 - if not self.current_item['leech'].isdigit(): - self.current_item['leech'] = 0 - prettyPrinter(self.current_item) - self.results.append('a') + def handle_starttag_tr(self, _): + """ Handler of tr start tag """ + self.current_item = dict() - def search(self, what, cat='all'): - ret = [] - i = 1 - while True and i<11: - results = [] - parser = self.SimpleSGMLParser(results, self.url) - dat = retrieve_url(self.url+'/search/%s/%s/seeds/%d'%(what, self.supported_categories[cat], i)) - results_re = re.compile('(?s)

Search results for.*') - for match in results_re.finditer(dat): - res_tab = match.group(0) - parser.feed(res_tab) + def handle_starttag_a(self, attrs): + """ Handler of a start tag """ + params = dict(attrs) + link = params["href"] + + if link.startswith("/get/"): + #download link + self.current_item["link"] = "".join((self.url, link)) + elif link.startswith("/tor/"): + #description + self.current_item["desc_link"] = "".join((self.url, link)) + self.cur_item_name = "name" + self.current_item["name"] = "" + elif self.next_queries and link.startswith("/search"): + if params["title"].startswith("Page"): + self.list_searches.append(link) + + def handle_starttag_td(self, attrs): + """ Handler of td start tag """ + if ("align", "right") in attrs: + if not "size" in self.current_item.keys(): + self.cur_item_name = "size" + self.current_item["size"] = "" + + def handle_starttag_span(self, attrs): + """ Handler of span start tag """ + if ("class", "g") in attrs: + self.cur_item_name = "seeds" + self.current_item["seeds"] = "" + elif ("class", "b") in attrs: + self.cur_item_name = "leech" + self.current_item["leech"] = "" + + def handle_starttag(self, tag, attrs): + """ Parser's start tag handler """ + if self.table_results: + dispatcher = getattr(self, "_".join(("handle_starttag", tag)), None) + if dispatcher: + dispatcher(attrs) + + elif tag == "table": + self.table_results = ("class", "maintable") in attrs + + def handle_endtag(self, tag): + """ Parser's end tag handler """ + if tag == "tr" and self.current_item: + self.current_item["engine_url"] = self.url + prettyPrinter(self.current_item) + self.current_item = None + elif self.cur_item_name: + if tag == "a" or tag == "span": + self.cur_item_name = None + + def handle_data(self, data): + """ Parser's data handler """ + if self.cur_item_name: + temp = self.current_item[self.cur_item_name] + self.current_item[self.cur_item_name] = " ".join((temp, data)) + + def search(self, what, cat="all"): + """ Performs search """ + 
connection = http("www.mininova.org") + + query = "/".join(("/search", what, self.supported_categories[cat], "seeds")) + + connection.request("GET", query) + response = connection.getresponse() + if response.status != 200: + return + + list_searches = [] + parser = self.MyHtmlParseWithBlackJack(list_searches, self.url) + parser.feed(response.read().decode('utf-8')) parser.close() - break - if len(results) <= 0: - break - i += 1 - + + parser.next_queries = False + for search_query in list_searches: + connection.request("GET", search_query) + response = connection.getresponse() + parser.feed(response.read().decode('utf-8')) + parser.close() + + connection.close() + return diff --git a/src/searchengine/nova3/engines/torrentreactor.py b/src/searchengine/nova3/engines/torrentreactor.py index a099ec5ab..da6391cba 100644 --- a/src/searchengine/nova3/engines/torrentreactor.py +++ b/src/searchengine/nova3/engines/torrentreactor.py @@ -1,4 +1,4 @@ -#VERSION: 1.33 +#VERSION: 1.35 #AUTHORS: Gekko Dam Beer (gekko04@users.sourceforge.net) #CONTRIBUTORS: Christophe Dumez (chris@qbittorrent.org) # Bruno Barbieri (brunorex@gmail.com) @@ -28,91 +28,94 @@ # POSSIBILITY OF SUCH DAMAGE. 
class torrentreactor(object):
    """ Search engine class for TorrentReactor.Net """
    url = 'http://www.torrentreactor.net'
    name = 'TorrentReactor.Net'
    supported_categories = {'all': '', 'movies': '5', 'tv': '8', 'music': '6', 'games': '3', 'anime': '1', 'software': '2'}

    def download_torrent(self, info):
        """ Prints whatever download_file returns for the given link """
        print(download_file(info))

    class SimpleHTMLParser(HTMLParser):
        """ Parser of a result page

        A result starts at its description link ("/torrents/<digits>...").
        The following download link provides the link and the name, and the
        text of the next three table cells is collected as size, seeds and
        leech.  The item is printed when a fourth cell is opened.
        """
        # description links look like /torrents/<numeric id>...
        desc_link_re = re.compile(r"/torrents/\d+.*")
        # index of the cell after the download link -> field it fills
        cell_fields = {1: 'size', 2: 'seeds', 3: 'leech'}

        def __init__(self, results, url, *args):
            HTMLParser.__init__(self)
            self.td_counter = None
            self.current_item = None
            self.results = results
            self.id = None
            self.url = url
            # defined up front so that it can be read before the first result
            self.has_results = False
            self.dispatcher = {'a': self.start_a, 'td': self.start_td}

        def handle_starttag(self, tag, attrs):
            """ Routes "a" and "td" start tags to their handlers """
            handler = self.dispatcher.get(tag)
            if handler is not None:
                handler(attrs)

        def start_a(self, attr):
            """ Handler of a start tag """
            href = dict(attr).get('href')
            if not href:
                # anchor without href: nothing to inspect
                return

            if self.desc_link_re.match(href):
                self.current_item = {'desc_link': self.url + href.strip()}
            elif 'torrentreactor.net/download.php' in href:
                if self.current_item is None:
                    # download link that no description link introduced
                    return
                self.td_counter = 0
                self.current_item['link'] = href.strip()
                # the name is read from the query string wherever it appears in it
                query = parse.parse_qs(parse.urlsplit(href.strip()).query)
                self.current_item['name'] = query.get('name', [''])[0]

        def handle_data(self, data):
            """ Appends stripped text to the field of the current cell """
            field = self.cell_fields.get(self.td_counter)
            if field is None or self.current_item is None:
                return
            self.current_item[field] = self.current_item.get(field, '') + data.strip()

        def start_td(self, attr):
            """ Handler of td start tag

            Counts the cells that follow a download link; the fourth one
            ends the item, which is then printed and recorded in results.
            """
            if not isinstance(self.td_counter, int):
                return

            self.td_counter += 1
            if self.td_counter > 3:
                self.td_counter = None
                # add item to results
                if self.current_item:
                    prettyPrinter(self._finalize_item())
                    self.has_results = True
                    self.results.append('a')

        def _finalize_item(self):
            """ Completes the current item and returns it

            Sets engine_url and replaces seeds/leech by 0 when they are
            missing or not made of digits only.
            """
            item = self.current_item
            item['engine_url'] = self.url
            for key in ('seeds', 'leech'):
                # an empty cell never creates its key, hence the default
                if not str(item.get(key, '')).isdigit():
                    item[key] = 0
            return item

    def search(self, what, cat='all'):
        """ Performs search

        what -- search pattern, placed into the request path as is
                (presumably already URL-quoted by the caller; verify)
        cat  -- key of supported_categories

        Requests up to 11 pages with an offset step of 35 and stops at
        the first page that is not answered with status 200 or that yields
        no result.
        """
        connection = http("www.torrentreactor.net")
        try:
            for page in range(11):
                results = []
                parser = self.SimpleHTMLParser(results, self.url)
                query = '/torrents-search/%s/%d?sort=seeders.desc&type=all&period=none&categories=%s' % (what, page * 35, self.supported_categories[cat])
                connection.request("GET", query)
                response = connection.getresponse()
                if response.status != 200:
                    break

                # undecodable bytes are replaced instead of aborting the search
                parser.feed(response.read().decode('utf-8', 'replace'))
                parser.close()
                if not results:
                    break
        finally:
            # released even when a request raises
            connection.close()