Improve RSS parsing logic.

This commit is contained in:
Vladimir Golovnev (Glassez) 2015-10-24 11:13:35 +03:00 committed by Vladimir Golovnev (qlassez)
parent 28ed981082
commit 6662081044
6 changed files with 263 additions and 354 deletions

View file

@ -1,6 +1,7 @@
/*
* Bittorrent Client using Qt4 and libtorrent.
* Copyright (C) 2012 Christophe Dumez
* Bittorrent Client using Qt and libtorrent.
* Copyright (C) 2015 Vladimir Golovnev <glassez@yandex.ru>
* Copyright (C) 2012 Christophe Dumez <chris@qbittorrent.org>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
@ -30,63 +31,35 @@
#include <QDebug>
#include <QDateTime>
#include <QFile>
#include <QRegExp>
#include <QStringList>
#include <QVariant>
#include <QXmlStreamReader>
#include "base/utils/fs.h"
#include "rssparser.h"
namespace Rss
namespace
{
namespace Private
{
struct ParsingJob
{
QString feedUrl;
QByteArray feedData;
};
}
}
static const char shortDay[][4] = {
const char shortDay[][4] = {
"Mon", "Tue", "Wed",
"Thu", "Fri", "Sat",
"Sun"
};
static const char longDay[][10] = {
const char longDay[][10] = {
"Monday", "Tuesday", "Wednesday",
"Thursday", "Friday", "Saturday",
"Sunday"
};
static const char shortMonth[][4] = {
const char shortMonth[][4] = {
"Jan", "Feb", "Mar", "Apr",
"May", "Jun", "Jul", "Aug",
"Sep", "Oct", "Nov", "Dec"
};
using namespace Rss::Private;
// Constructs the parser and starts its worker thread right away.
// m_running is initialised to true; it is the flag the run() loop tests on
// every iteration and that the destructor clears to stop the thread.
Parser::Parser(QObject *parent)
: QThread(parent)
, m_running(true)
{
start(); // QThread::start(): begins executing run() in the new thread
}
// Stops the worker thread and blocks until it has finished.
Parser::~Parser()
{
m_running = false; // ask the run() loop to stop iterating
m_waitCondition.wakeOne(); // rouse the worker in case it is blocked on the wait condition
wait(); // QThread::wait(): block until run() has returned
// NOTE(review): m_running is written here without holding m_mutex, while run()
// tests it before taking the lock - confirm the worker cannot miss this wake-up.
}
// Ported to Qt from KDElibs4
QDateTime Parser::parseDate(const QString &string)
QDateTime parseDate(const QString &string)
{
const QString str = string.trimmed();
if (str.isEmpty())
@ -231,47 +204,53 @@ QDateTime Parser::parseDate(const QString &string)
return result;
}
void Parser::parseFeedData(const QString &feedUrl, const QByteArray &feedData)
{
qDebug() << Q_FUNC_INFO << feedUrl;
m_mutex.lock();
ParsingJob job = { feedUrl, feedData };
m_queue.enqueue(job);
// Wake up thread.
if (m_queue.count() == 1) {
qDebug() << Q_FUNC_INFO << "Waking up thread";
m_waitCondition.wakeOne();
}
m_mutex.unlock();
}
void Parser::clearFeedData(const QString &feedUrl)
{
m_mutex.lock();
m_lastBuildDates.remove(feedUrl);
m_mutex.unlock();
}
using namespace Rss::Private;
void Parser::run()
// Read and create items from an RSS document
void Parser::parse(const QByteArray &feedData)
{
while (m_running) {
m_mutex.lock();
if (!m_queue.empty()) {
ParsingJob job = m_queue.dequeue();
m_mutex.unlock();
parseFeed(job);
qDebug() << Q_FUNC_INFO;
QXmlStreamReader xml(feedData);
bool foundChannel = false;
while (xml.readNextStartElement()) {
if (xml.name() == "rss") {
// Find channels
while (xml.readNextStartElement()) {
if (xml.name() == "channel") {
parseRSSChannel(xml);
foundChannel = true;
break;
}
else {
qDebug() << Q_FUNC_INFO << "Thread is waiting.";
m_waitCondition.wait(&m_mutex);
qDebug() << Q_FUNC_INFO << "Thread woke up.";
m_mutex.unlock();
qDebug() << "Skip rss item: " << xml.name();
xml.skipCurrentElement();
}
}
break;
}
else if (xml.name() == "feed") { // Atom feed
parseAtomChannel(xml);
foundChannel = true;
break;
}
else {
qDebug() << "Skip root item: " << xml.name();
xml.skipCurrentElement();
}
}
void Parser::parseRssArticle(QXmlStreamReader &xml, const QString &feedUrl)
if (xml.hasError())
emit finished(xml.errorString());
else if (!foundChannel)
emit finished(tr("Invalid RSS feed."));
else
emit finished(QString());
}
void Parser::parseRssArticle(QXmlStreamReader &xml)
{
QVariantHash article;
@ -332,12 +311,12 @@ void Parser::parseRssArticle(QXmlStreamReader &xml, const QString &feedUrl)
}
}
emit newArticle(feedUrl, article);
emit newArticle(article);
}
void Parser::parseRSSChannel(QXmlStreamReader &xml, const QString &feedUrl)
void Parser::parseRSSChannel(QXmlStreamReader &xml)
{
qDebug() << Q_FUNC_INFO << feedUrl;
qDebug() << Q_FUNC_INFO;
Q_ASSERT(xml.isStartElement() && xml.name() == "channel");
while(!xml.atEnd()) {
@ -346,27 +325,26 @@ void Parser::parseRSSChannel(QXmlStreamReader &xml, const QString &feedUrl)
if (xml.isStartElement()) {
if (xml.name() == "title") {
QString title = xml.readElementText();
emit feedTitle(feedUrl, title);
emit feedTitle(title);
}
else if (xml.name() == "lastBuildDate") {
QString lastBuildDate = xml.readElementText();
if (!lastBuildDate.isEmpty()) {
QMutexLocker locker(&m_mutex);
if (m_lastBuildDates.value(feedUrl, "") == lastBuildDate) {
if (m_lastBuildDate == lastBuildDate) {
qDebug() << "The RSS feed has not changed since last time, aborting parsing.";
return;
}
m_lastBuildDates[feedUrl] = lastBuildDate;
m_lastBuildDate = lastBuildDate;
}
}
else if (xml.name() == "item") {
parseRssArticle(xml, feedUrl);
parseRssArticle(xml);
}
}
}
}
void Parser::parseAtomArticle(QXmlStreamReader &xml, const QString &feedUrl, const QString &baseUrl)
void Parser::parseAtomArticle(QXmlStreamReader &xml)
{
QVariantHash article;
bool doubleContent = false;
@ -392,7 +370,7 @@ void Parser::parseAtomArticle(QXmlStreamReader &xml, const QString &feedUrl, con
// Atom feeds can contain relative links. Work around this here by
// assembling the full article URI, so that the UI does not have to
// figure it out.
article["news_link"] = ( baseUrl.isEmpty() ? link : baseUrl + link );
article["news_link"] = ( m_baseUrl.isEmpty() ? link : m_baseUrl + link );
}
else if ((xml.name() == "summary") || (xml.name() == "content")){
@ -453,15 +431,15 @@ void Parser::parseAtomArticle(QXmlStreamReader &xml, const QString &feedUrl, con
}
}
emit newArticle(feedUrl, article);
emit newArticle(article);
}
void Parser::parseAtomChannel(QXmlStreamReader &xml, const QString &feedUrl)
void Parser::parseAtomChannel(QXmlStreamReader &xml)
{
qDebug() << Q_FUNC_INFO << feedUrl;
qDebug() << Q_FUNC_INFO;
Q_ASSERT(xml.isStartElement() && xml.name() == "feed");
QString baseURL = xml.attributes().value("xml:base").toString();
m_baseUrl = xml.attributes().value("xml:base").toString();
while (!xml.atEnd()) {
xml.readNext();
@ -469,74 +447,21 @@ void Parser::parseAtomChannel(QXmlStreamReader &xml, const QString &feedUrl)
if (xml.isStartElement()) {
if (xml.name() == "title") {
QString title = xml.readElementText();
emit feedTitle(feedUrl, title);
emit feedTitle(title);
}
else if (xml.name() == "updated") {
QString lastBuildDate = xml.readElementText();
if (!lastBuildDate.isEmpty()) {
QMutexLocker locker(&m_mutex);
if (m_lastBuildDates.value(feedUrl) == lastBuildDate) {
if (m_lastBuildDate == lastBuildDate) {
qDebug() << "The RSS feed has not changed since last time, aborting parsing.";
return;
}
m_lastBuildDates[feedUrl] = lastBuildDate;
m_lastBuildDate = lastBuildDate;
}
}
else if (xml.name() == "entry") {
parseAtomArticle(xml, feedUrl, baseURL);
parseAtomArticle(xml);
}
}
}
}
// read and create items from a rss document
void Parser::parseFeed(const ParsingJob &job)
{
qDebug() << Q_FUNC_INFO << job.feedUrl;
QXmlStreamReader xml(job.feedData);
bool foundChannel = false;
while (xml.readNextStartElement()) {
if (xml.name() == "rss") {
// Find channels
while (xml.readNextStartElement()) {
if (xml.name() == "channel") {
parseRSSChannel(xml, job.feedUrl);
foundChannel = true;
break;
}
else {
qDebug() << "Skip rss item: " << xml.name();
xml.skipCurrentElement();
}
}
break;
}
else if (xml.name() == "feed") { // Atom feed
parseAtomChannel(xml, job.feedUrl);
foundChannel = true;
break;
}
else {
qDebug() << "Skip root item: " << xml.name();
xml.skipCurrentElement();
}
}
if (xml.hasError()) {
reportFailure(job, xml.errorString());
return;
}
if (!foundChannel) {
reportFailure(job, tr("Invalid RSS feed at '%1'.").arg(job.feedUrl));
return;
}
emit feedParsingFinished(job.feedUrl, QString());
}
// Announces that parsing of the given job failed.
//
// job   - the parsing job that failed; only its feedUrl is used.
// error - error text passed on to listeners.
void Parser::reportFailure(const ParsingJob &job, const QString &error)
{
emit feedParsingFinished(job.feedUrl, error); // same signal as the success path, with the error text as second argument
}

View file

@ -1,6 +1,7 @@
/*
* Bittorrent Client using Qt and libtorrent.
* Copyright (C) 2012 Christophe Dumez
* Copyright (C) 2015 Vladimir Golovnev <glassez@yandex.ru>
* Copyright (C) 2012 Christophe Dumez <chris@qbittorrent.org>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
@ -31,12 +32,9 @@
#ifndef RSSPARSER_H
#define RSSPARSER_H
#include <QHash>
#include <QMutex>
#include <QQueue>
#include <QThread>
#include <QObject>
#include <QString>
#include <QVariantHash>
#include <QWaitCondition>
class QXmlStreamReader;
@ -44,41 +42,26 @@ namespace Rss
{
namespace Private
{
struct ParsingJob;
class Parser: public QThread
class Parser: public QObject
{
Q_OBJECT
public:
explicit Parser(QObject *parent = 0);
~Parser();
void parseFeedData(const QString &feedUrl, const QByteArray &feedData);
void clearFeedData(const QString &feedUrl);
public slots:
void parse(const QByteArray &feedData);
signals:
void newArticle(const QString &feedUrl, const QVariantHash &rssArticle);
void feedTitle(const QString &feedUrl, const QString &title);
void feedParsingFinished(const QString &feedUrl, const QString &error);
void newArticle(const QVariantHash &rssArticle);
void feedTitle(const QString &title);
void finished(const QString &error);
private:
void run() override;
void parseRssArticle(QXmlStreamReader &xml);
void parseRSSChannel(QXmlStreamReader &xml);
void parseAtomArticle(QXmlStreamReader &xml);
void parseAtomChannel(QXmlStreamReader &xml);
static QDateTime parseDate(const QString &string);
void parseRssArticle(QXmlStreamReader &xml, const QString &feedUrl);
void parseRSSChannel(QXmlStreamReader &xml, const QString &feedUrl);
void parseAtomArticle(QXmlStreamReader &xml, const QString &feedUrl, const QString &baseUrl);
void parseAtomChannel(QXmlStreamReader &xml, const QString &feedUrl);
void parseFeed(const ParsingJob &job);
void reportFailure(const ParsingJob &job, const QString &error);
bool m_running;
QMutex m_mutex;
QQueue<ParsingJob> m_queue;
QWaitCondition m_waitCondition;
QHash<QString/*feedUrl*/, QString/*lastBuildDate*/> m_lastBuildDates; // Optimization
QString m_lastBuildDate; // Optimization
QString m_baseUrl;
};
}
}

View file

@ -1,5 +1,6 @@
/*
* Bittorrent Client using Qt and libtorrent.
* Copyright (C) 2015 Vladimir Golovnev <glassez@yandex.ru>
* Copyright (C) 2010 Christophe Dumez <chris@qbittorrent.org>
* Copyright (C) 2010 Arnaud Demaiziere <arnaud@qbittorrent.org>
*
@ -67,11 +68,13 @@ Feed::Feed(const QString &url, Manager *manager)
, m_loading(false)
{
qDebug() << Q_FUNC_INFO << m_url;
m_parser = new Private::Parser;
m_parser->moveToThread(m_manager->workingThread());
connect(this, SIGNAL(destroyed()), m_parser, SLOT(deleteLater()));
// Listen for new RSS downloads
Private::Parser *const parser = m_manager->rssParser();
connect(parser, SIGNAL(feedTitle(QString,QString)), SLOT(handleFeedTitle(QString,QString)));
connect(parser, SIGNAL(newArticle(QString,QVariantHash)), SLOT(handleNewArticle(QString,QVariantHash)));
connect(parser, SIGNAL(feedParsingFinished(QString,QString)), SLOT(handleParsingFinished(QString,QString)));
connect(m_parser, SIGNAL(feedTitle(QString)), SLOT(handleFeedTitle(QString)));
connect(m_parser, SIGNAL(newArticle(QVariantHash)), SLOT(handleNewArticle(QVariantHash)));
connect(m_parser, SIGNAL(finished(QString)), SLOT(handleParsingFinished(QString)));
// Download the RSS Feed icon
Net::DownloadHandler *handler = Net::DownloadManager::instance()->downloadUrl(iconUrl(), true);
@ -87,7 +90,6 @@ Feed::~Feed()
{
if (!m_icon.startsWith(":/") && QFile::exists(m_icon))
Utils::Fs::forceRemove(m_icon);
m_manager->rssParser()->clearFeedData(m_url);
}
void Feed::saveItemsToDisk()
@ -320,7 +322,6 @@ QString Feed::iconUrl() const
void Feed::handleIconDownloadFinished(const QString &url, const QString &filePath)
{
Q_UNUSED(url);
m_icon = filePath;
qDebug() << Q_FUNC_INFO << "icon path:" << m_icon;
m_manager->forwardFeedIconChanged(m_url, m_icon);
@ -328,30 +329,31 @@ void Feed::handleIconDownloadFinished(const QString &url, const QString &filePat
void Feed::handleRssDownloadFinished(const QString &url, const QByteArray &data)
{
qDebug() << Q_FUNC_INFO << "Successfully downloaded RSS feed at" << url;
Q_UNUSED(url);
qDebug() << Q_FUNC_INFO << "Successfully downloaded RSS feed at" << m_url;
// Parse the downloaded RSS feed
m_manager->rssParser()->parseFeedData(m_url, data);
QMetaObject::invokeMethod(m_parser, "parse", Qt::QueuedConnection, Q_ARG(QByteArray, data));
}
void Feed::handleRssDownloadFailed(const QString &url, const QString &error)
{
Q_UNUSED(url);
m_inErrorState = true;
m_loading = false;
m_manager->forwardFeedInfosChanged(m_url, displayName(), m_unreadCount);
qWarning() << "Failed to download RSS feed at" << url;
qWarning() << "Failed to download RSS feed at" << m_url;
qWarning() << "Reason:" << error;
}
void Feed::handleFeedTitle(const QString &feedUrl, const QString &title)
void Feed::handleFeedTitle(const QString &title)
{
if (feedUrl != m_url) return;
if (m_title == title) return;
m_title = title;
// Notify that we now have something better than a URL to display
if (m_alias.isEmpty())
m_manager->forwardFeedInfosChanged(feedUrl, title, m_unreadCount);
m_manager->forwardFeedInfosChanged(m_url, title, m_unreadCount);
}
void Feed::downloadArticleTorrentIfMatching(const ArticlePtr &article)
@ -406,13 +408,11 @@ void Feed::recheckRssItemsForDownload()
}
}
void Feed::handleNewArticle(const QString &feedUrl, const QVariantHash &articleData)
void Feed::handleNewArticle(const QVariantHash &articleData)
{
if (feedUrl != m_url) return;
ArticlePtr article = Article::fromHash(this, articleData);
if (article.isNull()) {
qDebug() << "Article hash corrupted or guid is uncomputable; feed url: " << feedUrl;
qDebug() << "Article hash corrupted or guid is uncomputable; feed url: " << m_url;
return;
}
Q_ASSERT(article);
@ -424,12 +424,10 @@ void Feed::handleNewArticle(const QString &feedUrl, const QVariantHash &articleD
//m_manager->forwardFeedContentChanged(m_url);
}
void Feed::handleParsingFinished(const QString &feedUrl, const QString &error)
void Feed::handleParsingFinished(const QString &error)
{
if (feedUrl != m_url) return;
if (!error.isEmpty()) {
qWarning() << "Failed to parse RSS feed at" << feedUrl;
qWarning() << "Failed to parse RSS feed at" << m_url;
qWarning() << "Reason:" << error;
}

View file

@ -1,5 +1,6 @@
/*
* Bittorrent Client using Qt and libtorrent.
* Copyright (C) 2015 Vladimir Golovnev <glassez@yandex.ru>
* Copyright (C) 2010 Christophe Dumez <chris@qbittorrent.org>
* Copyright (C) 2010 Arnaud Demaiziere <arnaud@qbittorrent.org>
*
@ -51,6 +52,11 @@ namespace Rss
typedef QSharedPointer<Feed> FeedPtr;
typedef QList<FeedPtr> FeedList;
namespace Private
{
class Parser;
}
bool articleDateRecentThan(const ArticlePtr &left, const ArticlePtr &right);
class Feed: public QObject, public File
@ -86,9 +92,9 @@ namespace Rss
void handleIconDownloadFinished(const QString &url, const QString &filePath);
void handleRssDownloadFinished(const QString &url, const QByteArray &data);
void handleRssDownloadFailed(const QString &url, const QString &error);
void handleFeedTitle(const QString &feedUrl, const QString &title);
void handleNewArticle(const QString &feedUrl, const QVariantHash &article);
void handleParsingFinished(const QString &feedUrl, const QString &error);
void handleFeedTitle(const QString &title);
void handleNewArticle(const QVariantHash &article);
void handleParsingFinished(const QString &error);
void handleArticleRead();
private:
@ -99,6 +105,7 @@ namespace Rss
private:
Manager *m_manager;
Private::Parser *m_parser;
ArticleHash m_articles;
ArticleList m_articlesByDate; // Articles sorted by date (more recent first)
QString m_title;

View file

@ -33,7 +33,6 @@
#include "base/logger.h"
#include "base/preferences.h"
#include "private/rssparser.h"
#include "rssfolder.h"
#include "rssfeed.h"
#include "rssarticle.h"
@ -48,9 +47,10 @@ using namespace Rss::Private;
Manager::Manager(QObject *parent)
: QObject(parent)
, m_downloadRules(new DownloadRuleList)
, m_rssParser(new Parser(this))
, m_rootFolder(new Folder)
, m_workingThread(new QThread(this))
{
m_workingThread->start();
connect(&m_refreshTimer, SIGNAL(timeout()), SLOT(refresh()));
m_refreshInterval = Preferences::instance()->getRSSRefreshInterval();
m_refreshTimer.start(m_refreshInterval * MSECS_PER_MIN);
@ -59,8 +59,9 @@ Manager::Manager(QObject *parent)
Manager::~Manager()
{
qDebug("Deleting RSSManager...");
m_workingThread->quit();
m_workingThread->wait();
delete m_downloadRules;
delete m_rssParser;
m_rootFolder->saveItemsToDisk();
saveStreamList();
m_rootFolder.clear();
@ -178,9 +179,9 @@ FolderPtr Manager::rootFolder() const
return m_rootFolder;
}
Parser *Manager::rssParser() const
QThread *Manager::workingThread() const
{
return m_rssParser;
return m_workingThread;
}
void Manager::refresh()

View file

@ -35,6 +35,7 @@
#include <QObject>
#include <QTimer>
#include <QSharedPointer>
#include <QThread>
namespace Rss
{
@ -48,11 +49,6 @@ namespace Rss
typedef QSharedPointer<Folder> FolderPtr;
typedef QSharedPointer<Feed> FeedPtr;
namespace Private
{
class Parser;
}
typedef QSharedPointer<Manager> ManagerPtr;
class Manager: public QObject
@ -65,8 +61,7 @@ namespace Rss
DownloadRuleList *downloadRules() const;
FolderPtr rootFolder() const;
Private::Parser *rssParser() const;
QThread *workingThread() const;
public slots:
void refresh();
@ -87,8 +82,8 @@ namespace Rss
QTimer m_refreshTimer;
uint m_refreshInterval;
DownloadRuleList *m_downloadRules;
Private::Parser *m_rssParser;
FolderPtr m_rootFolder;
QThread *m_workingThread;
};
}