qBittorrent/src/gui/rss/rssparser.cpp

/*
 * Bittorrent Client using Qt4 and libtorrent.
 * Copyright (C) 2012  Christophe Dumez
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
 *
 * In addition, as a special exception, the copyright holders give permission to
 * link this program with the OpenSSL project's "OpenSSL" library (or with
 * modified versions of it that use the same license as the "OpenSSL" library),
 * and distribute the linked executables. You must obey the GNU General Public
 * License in all respects for all of the code used other than "OpenSSL".  If you
 * modify file(s), you may extend this exception to your version of the file(s),
 * but you are not obligated to do so. If you do not wish to do so, delete this
 * exception statement from your version.
 *
 * Contact : chris@qbittorrent.org
 */

#include "rssparser.h"
#include "downloadthread.h"
#include "fs_utils.h"
#include <QDebug>
#include <QFile>
#include <QRegExp>
#include <QStringList>
#include <QVariant>
#include <QTextDocument>

struct ParsingJob {
  QString feedUrl;
  QString filePath;
};

static const char shortDay[][4] = {
  "Mon", "Tue", "Wed",
  "Thu", "Fri", "Sat",
  "Sun"
};
static const char longDay[][10] = {
  "Monday", "Tuesday", "Wednesday",
  "Thursday", "Friday", "Saturday",
  "Sunday"
};
static const char shortMonth[][4] = {
  "Jan", "Feb", "Mar", "Apr",
  "May", "Jun", "Jul", "Aug",
  "Sep", "Oct", "Nov", "Dec"
};
static const char longMonth[][10] = {
  "January", "February", "March",
  "April", "May", "June",
  "July", "August", "September",
  "October", "November", "December"
};

// Ported to Qt4 from KDElibs4
QDateTime RssParser::parseDate(const QString &string) {
  const QString str = string.trimmed();
  if (str.isEmpty())
    return QDateTime::currentDateTime();

  int nyear  = 6;   // indexes within string to values
  int nmonth = 4;
  int nday   = 2;
  int nwday  = 1;
  int nhour  = 7;
  int nmin   = 8;
  int nsec   = 9;
  // Also accept obsolete form "Weekday, DD-Mon-YY HH:MM:SS ±hhmm"
  QRegExp rx("^(?:([A-Z][a-z]+),\\s*)?(\\d{1,2})(\\s+|-)([^-\\s]+)(\\s+|-)(\\d{2,4})\\s+(\\d\\d):(\\d\\d)(?::(\\d\\d))?\\s+(\\S+)$");
  QStringList parts;
  if (!str.indexOf(rx)) {
    // Check that if date has '-' separators, both separators are '-'.
    parts = rx.capturedTexts();
    bool h1 = (parts[3] == QLatin1String("-"));
    bool h2 = (parts[5] == QLatin1String("-"));
    if (h1 != h2)
      return QDateTime::currentDateTime();
  } else {
    // Check for the obsolete form "Wdy Mon DD HH:MM:SS YYYY"
    rx = QRegExp("^([A-Z][a-z]+)\\s+(\\S+)\\s+(\\d\\d)\\s+(\\d\\d):(\\d\\d):(\\d\\d)\\s+(\\d\\d\\d\\d)$");
    if (str.indexOf(rx))
      return QDateTime::currentDateTime();
    nyear  = 7;
    nmonth = 2;
    nday   = 3;
    nwday  = 1;
    nhour  = 4;
    nmin   = 5;
    nsec   = 6;
    parts = rx.capturedTexts();
  }
  bool ok[4];
  const int day    = parts[nday].toInt(&ok[0]);
  int year   = parts[nyear].toInt(&ok[1]);
  const int hour   = parts[nhour].toInt(&ok[2]);
  const int minute = parts[nmin].toInt(&ok[3]);
  if (!ok[0] || !ok[1] || !ok[2] || !ok[3])
    return QDateTime::currentDateTime();
  int second = 0;
  if (!parts[nsec].isEmpty()) {
    second = parts[nsec].toInt(&ok[0]);
    if (!ok[0])
      return QDateTime::currentDateTime();
  }
  bool leapSecond = (second == 60);
  if (leapSecond)
    second = 59;   // apparently a leap second - validate below, once time zone is known
  int month = 0;
  for ( ;  month < 12  &&  parts[nmonth] != shortMonth[month];  ++month) ;
  int dayOfWeek = -1;
  if (!parts[nwday].isEmpty()) {
    // Look up the weekday name
    while (++dayOfWeek < 7  &&  shortDay[dayOfWeek] != parts[nwday]) ;
    if (dayOfWeek >= 7)
      for (dayOfWeek = 0;  dayOfWeek < 7  &&  longDay[dayOfWeek] != parts[nwday];  ++dayOfWeek) ;
  }
  //       if (month >= 12 || dayOfWeek >= 7
  //       ||  (dayOfWeek < 0  &&  format == RFCDateDay))
  //         return QDateTime;
  int i = parts[nyear].size();
  if (i < 4) {
    // It's an obsolete year specification with less than 4 digits
    year += (i == 2  &&  year < 50) ? 2000: 1900;
  }

  // Parse the UTC offset part
  int offset = 0;           // set default to '-0000'
  bool negOffset = false;
  if (parts.count() > 10) {
    rx = QRegExp("^([+-])(\\d\\d)(\\d\\d)$");
    if (!parts[10].indexOf(rx)) {
      // It's a UTC offset ±hhmm
      parts = rx.capturedTexts();
      offset = parts[2].toInt(&ok[0]) * 3600;
      int offsetMin = parts[3].toInt(&ok[1]);
      if (!ok[0] || !ok[1] || offsetMin > 59)
        return QDateTime();
      offset += offsetMin * 60;
      negOffset = (parts[1] == QLatin1String("-"));
      if (negOffset)
        offset = -offset;
    } else {
      // Check for an obsolete time zone name
      QByteArray zone = parts[10].toLatin1();
      if (zone.length() == 1  &&  isalpha(zone[0])  &&  toupper(zone[0]) != 'J')
        negOffset = true;    // military zone: RFC 2822 treats as '-0000'
      else if (zone != "UT" && zone != "GMT") {    // treated as '+0000'
        offset = (zone == "EDT")                  ? -4*3600
                                                  : (zone == "EST" || zone == "CDT") ? -5*3600
                                                                                     : (zone == "CST" || zone == "MDT") ? -6*3600
                                                                                                                        : (zone == "MST" || zone == "PDT") ? -7*3600
                                                                                                                                                           : (zone == "PST")                  ? -8*3600
                                                                                                                                                                                              : 0;
        if (!offset) {
          // Check for any other alphabetic time zone
          bool nonalpha = false;
          for (int i = 0, end = zone.size();  i < end && !nonalpha;  ++i)
            nonalpha = !isalpha(zone[i]);
          if (nonalpha)
            return QDateTime();
          // TODO: Attempt to recognize the time zone abbreviation?
          negOffset = true;    // unknown time zone: RFC 2822 treats as '-0000'
        }
      }
    }
  }
  QDate qdate(year, month+1, day);   // convert date, and check for out-of-range
  if (!qdate.isValid())
    return QDateTime::currentDateTime();
  QTime qTime(hour, minute, second);

  QDateTime result(qdate, qTime, Qt::UTC);
  if (offset)
    result = result.addSecs(-offset);
  if (!result.isValid())
    return QDateTime::currentDateTime();    // invalid date/time

  if (leapSecond) {
    // Validate a leap second time. Leap seconds are inserted after 23:59:59 UTC.
    // Convert the time to UTC and check that it is 00:00:00.
    if ((hour*3600 + minute*60 + 60 - offset + 86400*5) % 86400)   // (max abs(offset) is 100 hours)
      return QDateTime::currentDateTime();    // the time isn't the last second of the day
  }
  return result;
}

RssParser::RssParser(QObject *parent) :
  QThread(parent), m_running(true)
{
  start();
}

RssParser::~RssParser()
{
  m_running = false;
  m_waitCondition.wakeOne();
  wait();
}

void RssParser::parseRssFile(const QString& feedUrl, const QString& filePath)
{
  qDebug() << Q_FUNC_INFO << feedUrl << filePath;
  m_mutex.lock();
  ParsingJob job = { feedUrl, fsutils::fromNativePath(filePath) };
  m_queue.enqueue(job);
  // Wake up thread.
  if (m_queue.count() == 1) {
    qDebug() << Q_FUNC_INFO << "Waking up thread";
    m_waitCondition.wakeOne();
  }
  m_mutex.unlock();
}

void RssParser::clearFeedData(const QString &feedUrl)
{
  m_mutex.lock();
  m_lastBuildDates.remove(feedUrl);
  m_mutex.unlock();
}

void RssParser::run()
{
  while (m_running) {
    m_mutex.lock();
    if (!m_queue.empty()) {
      ParsingJob job = m_queue.dequeue();
      m_mutex.unlock();
      parseFeed(job);
    } else {
      qDebug() << Q_FUNC_INFO << "Thread is waiting.";
      m_waitCondition.wait(&m_mutex);
      qDebug() << Q_FUNC_INFO << "Thread woke up.";
      m_mutex.unlock();
    }
  }
}

void RssParser::parseRssArticle(QXmlStreamReader& xml, const QString& feedUrl)
{
  QVariantHash article;

  while(!xml.atEnd()) {
    xml.readNext();

    if(xml.isEndElement() && xml.name() == "item")
      break;

    if (xml.isStartElement()) {
      if (xml.name() == "title")
        article["title"] = xml.readElementText();
      else if (xml.name() == "enclosure") {
        if (xml.attributes().value("type") == "application/x-bittorrent")
          article["torrent_url"] = xml.attributes().value("url").toString();
      }
      else if (xml.name() == "link")
        article["news_link"] = xml.readElementText();
      else if (xml.name() == "description")
        article["description"] = xml.readElementText();
      else if (xml.name() == "pubDate")
        article["date"] = parseDate(xml.readElementText());
      else if (xml.name() == "author")
        article["author"] = xml.readElementText();
      else if (xml.name() == "guid")
        article["id"] = xml.readElementText();
    }
  }

  if (!article.contains("id")) {
    // Item does not have a guid, fall back to some other identifier
    const QString link = article.value("news_link").toString();
    if (!link.isEmpty())
      article["id"] = link;
    else {
      const QString title = article.value("title").toString();
      if (!title.isEmpty())
        article["id"] = title;
      else {
        qWarning() << "Item has no guid, link or title, ignoring it...";
        return;
      }
    }
  }

  emit newArticle(feedUrl, article);
}

void RssParser::parseRSSChannel(QXmlStreamReader& xml, const QString& feedUrl)
{
  qDebug() << Q_FUNC_INFO << feedUrl;
  Q_ASSERT(xml.isStartElement() && xml.name() == "channel");

  while(!xml.atEnd()) {
    xml.readNext();

    if (xml.isStartElement()) {
      if (xml.name() == "title") {
        QString title = xml.readElementText();
        emit feedTitle(feedUrl, title);
      }
      else if (xml.name() == "lastBuildDate") {
        QString lastBuildDate = xml.readElementText();
        if (!lastBuildDate.isEmpty()) {
          QMutexLocker locker(&m_mutex);
          if (m_lastBuildDates.value(feedUrl, "") == lastBuildDate) {
            qDebug() << "The RSS feed has not changed since last time, aborting parsing.";
            return;
          }
          m_lastBuildDates[feedUrl] = lastBuildDate;
        }
      }
      else if (xml.name() == "item") {
        parseRssArticle(xml, feedUrl);
      }
    }
  }
}

void RssParser::parseAtomArticle(QXmlStreamReader& xml, const QString& feedUrl, const QString& baseUrl)
{
  QVariantHash article;
  bool double_content = false;

  while(!xml.atEnd()) {
    xml.readNext();

    if(xml.isEndElement() && xml.name() == "entry")
      break;

    if (xml.isStartElement()) {
      if (xml.name() == "title") {
        // Workaround for CDATA (QString cannot parse html escapes on it's own)
        QTextDocument doc;
        doc.setHtml(xml.readElementText());
        article["title"] = doc.toPlainText();
      }
      else if (xml.name() == "link") {
        QString theLink = ( xml.attributes().isEmpty() ?
                              xml.readElementText() :
                              xml.attributes().value("href").toString() );

        // Atom feeds can have relative links, work around this and
        // take the stress of figuring article full URI from UI

        // Assemble full URI
        article["news_link"] = ( baseUrl.isEmpty() ?
                                   theLink :
                                   baseUrl + theLink );
      }
      else if (xml.name() == "summary" || xml.name() == "content"){
        if(double_content) { // Duplicate content -> ignore
          xml.readNext();

          while(xml.name() != "summary" && xml.name() != "content")
            xml.readNext();

          continue;
        }

        // Try to also parse broken articles, which don't use html '&' escapes
        // Actually works great for non-broken content too
        QString feedText = xml.readElementText(QXmlStreamReader::IncludeChildElements);
        if (!feedText.isEmpty())
          article["description"] = feedText;

        double_content = true;
      }
      else if (xml.name() == "updated"){
        // ATOM uses standard compliant date, don't do fancy stuff
        QDateTime articleDate = QDateTime::fromString(xml.readElementText(), Qt::ISODate);
        article["date"] = ( articleDate.isValid() ?
                              articleDate :
                              QDateTime::currentDateTime() );
      }
      else if (xml.name() == "author") {
        xml.readNext();
        while(xml.name() != "author") {
          if(xml.name() == "name")
            article["author"] = xml.readElementText();
          xml.readNext();
        }
      }
      else if (xml.name() == "id")
        article["id"] = xml.readElementText();
    }
  }

  if (!article.contains("id")) {
    // Item does not have a guid, fall back to some other identifier
    const QString link = article.value("news_link").toString();
    if (!link.isEmpty())
      article["id"] = link;
    else {
      const QString title = article.value("title").toString();
      if (!title.isEmpty())
        article["id"] = title;
      else {
        qWarning() << "Item has no guid, link or title, ignoring it...";
        return;
      }
    }
  }

  emit newArticle(feedUrl, article);
}

void RssParser::parseAtomChannel(QXmlStreamReader& xml, const QString& feedUrl)
{
  qDebug() << Q_FUNC_INFO << feedUrl;
  Q_ASSERT(xml.isStartElement() && xml.name() == "feed");

  QString baseURL = xml.attributes().value("xml:base").toString();

  while(!xml.atEnd()) {
    xml.readNext();

    if (xml.isStartElement()) {
      if (xml.name() == "title") {
        QString title = xml.readElementText();
        emit feedTitle(feedUrl, title);
      }
      else if (xml.name() == "updated") {
        QString lastBuildDate = xml.readElementText();
        if (!lastBuildDate.isEmpty()) {
          QMutexLocker locker(&m_mutex);
          if (m_lastBuildDates.value(feedUrl) == lastBuildDate) {
            qDebug() << "The RSS feed has not changed since last time, aborting parsing.";
            return;
          }
          m_lastBuildDates[feedUrl] = lastBuildDate;
        }
      }
      else if (xml.name() == "entry") {
        parseAtomArticle(xml, feedUrl, baseURL);
      }
    }
  }
}

// read and create items from a rss document
void RssParser::parseFeed(const ParsingJob& job)
{
  qDebug() << Q_FUNC_INFO << job.feedUrl << job.filePath;
  QFile fileRss(job.filePath);
  if (!fileRss.open(QIODevice::ReadOnly | QIODevice::Text)) {
    reportFailure(job, tr("Failed to open downloaded RSS file."));
    return;
  }
  QXmlStreamReader xml(&fileRss);

  bool found_channel = false;
  while (xml.readNextStartElement()) {
    if (xml.name() == "rss") {
      // Find channels
      while (xml.readNextStartElement()) {
        if (xml.name() == "channel") {
          parseRSSChannel(xml, job.feedUrl);
          found_channel = true;
          break;
        } else {
          qDebug() << "Skip rss item: " << xml.name();
          xml.skipCurrentElement();
        }
      }
      break;
    }
    else if (xml.name() == "feed") { // Atom feed
      parseAtomChannel(xml, job.feedUrl);
      found_channel = true;
      break;
    } else {
      qDebug() << "Skip root item: " << xml.name();
      xml.skipCurrentElement();
    }
  }

  if (xml.hasError()) {
    reportFailure(job, xml.errorString());
    return;
  }

  if (!found_channel) {
    reportFailure(job, tr("Invalid RSS feed at %1.").arg(job.feedUrl));
    return;
  }

  // Clean up
  fileRss.close();
  emit feedParsingFinished(job.feedUrl, QString());
  fsutils::forceRemove(job.filePath);
}

void RssParser::reportFailure(const ParsingJob& job, const QString& error)
{
  emit feedParsingFinished(job.feedUrl, error);
  fsutils::forceRemove(job.filePath);
}