# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
#    * Redistributions of source code must retain the above copyright notice,
#      this list of conditions and the following disclaimer.
#    * Redistributions in binary form must reproduce the above copyright
#      notice, this list of conditions and the following disclaimer in the
#      documentation and/or other materials provided with the distribution.
#    * Neither the name of the author nor the names of its contributors may be
#      used to endorse or promote products derived from this software without
#      specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.

#VERSION: 1.2

# Author:
#  Christophe DUMEZ (chris@qbittorrent.org)

"""HTTP helpers: fetch a page as text, download a file, decode HTML entities."""

import gzip
import io
import os
import re
import tempfile

try:
    # Python 2 module names (the ones this file was written against).
    import htmlentitydefs
    import StringIO  # no longer used below; kept so the module namespace is unchanged
    import urllib2
except ImportError:
    # Python 3 renamed these modules; bind them to the historical names so the
    # rest of the file reads the same on both interpreters.
    import html.entities as htmlentitydefs
    import urllib.request as urllib2

try:
    unichr
except NameError:
    # Python 3 has no unichr(); chr() is the equivalent there.
    unichr = chr

# Entity patterns, compiled once at import time.
_NAMED_ENTITY_RE = re.compile(r'&([A-Za-z][A-Za-z0-9]*);')
_DECIMAL_ENTITY_RE = re.compile(r'&#([0-9]+);')
_HEX_ENTITY_RE = re.compile(r'&#[xX]([0-9a-fA-F]+);')
# Extracts the value of a "charset=" parameter, tolerating optional quotes and
# trailing parameters (e.g. 'text/html; charset="utf-8"; foo=bar').
_CHARSET_RE = re.compile(r'charset\s*=\s*["\']?([^\s;"\']+)', re.IGNORECASE)


def _codepoint_to_char(codepoint, fallback):
    """Return the character for codepoint, or fallback if it is not representable."""
    try:
        return unichr(codepoint)
    except (ValueError, OverflowError):
        # Out of the range unichr()/chr() accepts: keep the original text.
        return fallback


def htmlentitydecode(s):
    """Replace HTML character references in s by the characters they denote.

    Three passes are applied in this order: named entities (such as
    &eacute;), decimal references (such as &#233;), then hexadecimal
    references (such as &#xe9;). Because the passes run one after the other,
    text produced by an earlier pass can be decoded again by a later one
    (for instance "&amp;#233;" ends up as the single character U+00E9).

    References that cannot be resolved (unknown name, code point outside the
    supported range) are left in the text unchanged.

    s is expected to be a text (unicode) string; the decoded text is returned.
    """
    # (Inspired from http://mail.python.org/pipermail/python-list/2007-June/443813.html)
    def named(match):
        codepoint = htmlentitydefs.name2codepoint.get(match.group(1))
        if codepoint is None:
            return match.group(0)
        return _codepoint_to_char(codepoint, match.group(0))

    def decimal(match):
        return _codepoint_to_char(int(match.group(1)), match.group(0))

    def hexadecimal(match):
        return _codepoint_to_char(int(match.group(1), 16), match.group(0))

    t = _NAMED_ENTITY_RE.sub(named, s)
    t = _DECIMAL_ENTITY_RE.sub(decimal, t)
    return _HEX_ENTITY_RE.sub(hexadecimal, t)


def _fetch(req):
    """Open req and return (body bytes, response headers), closing the response."""
    response = urllib2.urlopen(req)
    try:
        return response.read(), response.info()
    finally:
        response.close()


def _decompress_if_gzipped(data, headers):
    """Return data gunzipped when the Content-Encoding header mentions gzip."""
    content_encoding = headers.get('Content-Encoding')
    if content_encoding is None or 'gzip' not in content_encoding.lower():
        return data
    return gzip.GzipFile(fileobj=io.BytesIO(data)).read()


def _charset_from_headers(headers, default='utf-8'):
    """Return the charset named in the Content-Type header, or default."""
    content_type = headers.get('Content-Type')
    if content_type:
        match = _CHARSET_RE.search(content_type)
        if match:
            return match.group(1)
    return default


def retrieve_url(url):
    """Return the content of the url page as a UTF-8 encoded byte string.

    The body is gunzipped if the server marked it as gzip encoded, decoded
    with the charset announced in the Content-Type header (UTF-8 when none is
    announced or the announced one is unknown to Python), passed through
    htmlentitydecode() and finally re-encoded as UTF-8.

    Errors raised by urllib2.urlopen() are not caught here.
    """
    dat, info = _fetch(urllib2.Request(url))
    dat = _decompress_if_gzipped(dat, info)
    charset = _charset_from_headers(info)
    try:
        text = dat.decode(charset, 'replace')
    except LookupError:
        # The server announced a codec Python does not know about.
        text = dat.decode('utf-8', 'replace')
    text = htmlentitydecode(text)
    return text.encode('utf-8', 'replace')


def download_file(url, referer=None):
    """Download the file at url into a temporary file.

    url     -- address of the file to download
    referer -- optional value sent as the "referer" request header

    Returns the string "<path> <url>", where <path> is the temporary file that
    now holds the (gunzipped, if needed) response body. The caller owns that
    file and is responsible for deleting it.

    Errors raised by urllib2.urlopen() are not caught here. The temporary file
    is only created once the download has succeeded, so a failed request
    leaves nothing behind.
    """
    req = urllib2.Request(url)
    if referer is not None:
        req.add_header('referer', referer)
    dat, info = _fetch(req)
    dat = _decompress_if_gzipped(dat, info)
    # Binary mode: the payload is raw bytes and must be written untouched.
    fd, path = tempfile.mkstemp()
    with os.fdopen(fd, "wb") as out:
        out.write(dat)
    return path + " " + url