Excludes: Translate full-path patterns to regex

Improves full matches by more than an order of magnitude
and also improves speed of traversal matches by roughly 20%,
judging by the check_csync_exclude performance test.
This commit is contained in:
Christian Kamm 2017-11-29 12:13:19 +01:00 committed by ckamm
parent 1c3d5ab158
commit e5c780ee19
3 changed files with 244 additions and 90 deletions

View file

@ -463,18 +463,13 @@ void ExcludedFiles::clearManualExcludes()
bool ExcludedFiles::reloadExcludeFiles()
{
_allExcludes.clear();
_bnameRegexFileDir = QRegularExpression();
bool success = true;
foreach (const QString &file, _excludeFiles) {
if (csync_exclude_load(file.toUtf8(), &_allExcludes) < 0)
success = false;
}
_allExcludes.append(_manualExcludes);
prepare();
return success;
}
@ -519,37 +514,46 @@ bool ExcludedFiles::isExcluded(
CSYNC_EXCLUDE_TYPE ExcludedFiles::traversalPatternMatch(const char *path, int filetype) const
{
CSYNC_EXCLUDE_TYPE match = CSYNC_NOT_EXCLUDED;
/* Check only static patterns and only with the reduced list which is empty usually */
match = _csync_excluded_common(_nonRegexExcludes, path, filetype, false);
if (match != CSYNC_NOT_EXCLUDED) {
auto match = _csync_excluded_common(_nonRegexExcludes, path, filetype, false);
if (match != CSYNC_NOT_EXCLUDED)
return match;
}
if (_allExcludes.isEmpty())
return CSYNC_NOT_EXCLUDED;
if (!_allExcludes.isEmpty()) {
/* Now check with our optimized regexps */
const char *bname = NULL;
/* split up the path */
bname = strrchr(path, '/');
if (bname) {
bname += 1; // don't include the /
} else {
bname = path;
}
QString p = QString::fromUtf8(bname);
QRegularExpressionMatch m;
if (filetype == CSYNC_FTW_TYPE_DIR) {
m = _bnameRegexDir.match(p);
} else {
m = _bnameRegexFileDir.match(p);
}
if (m.hasMatch()) {
if (!m.captured(1).isEmpty()) {
match = CSYNC_FILE_EXCLUDE_LIST;
} else if (!m.captured(2).isEmpty()) {
match = CSYNC_FILE_EXCLUDE_AND_REMOVE;
}
// Check the bname part of the path to see whether the full
// regex should be run.
auto bname = strrchr(path, '/');
if (bname) {
bname += 1; // don't include the /
} else {
bname = path;
}
QString bnameStr = QString::fromUtf8(bname);
QRegularExpressionMatch m;
if (filetype == CSYNC_FTW_TYPE_DIR) {
m = _bnameActivationRegexDir.match(bnameStr);
} else {
m = _bnameActivationRegexFile.match(bnameStr);
}
if (!m.hasMatch())
return match;
// Now run the full match
QString pathStr = QString::fromUtf8(path);
if (filetype == CSYNC_FTW_TYPE_DIR) {
m = _fullRegexDir.match(pathStr);
} else {
m = _fullRegexFile.match(pathStr);
}
if (m.hasMatch()) {
if (!m.captured(1).isEmpty()) {
match = CSYNC_FILE_EXCLUDE_LIST;
} else if (!m.captured(2).isEmpty()) {
match = CSYNC_FILE_EXCLUDE_AND_REMOVE;
}
}
return match;
@ -557,8 +561,28 @@ CSYNC_EXCLUDE_TYPE ExcludedFiles::traversalPatternMatch(const char *path, int fi
CSYNC_EXCLUDE_TYPE ExcludedFiles::fullPatternMatch(const char *path, int filetype) const
{
// bname check must also apply to all path components
return _csync_excluded_common(_allExcludes, path, filetype, true);
/* Check only static patterns and only with the reduced list which is empty usually */
auto match = _csync_excluded_common(_nonRegexExcludes, path, filetype, true);
if (match != CSYNC_NOT_EXCLUDED)
return match;
if (_allExcludes.isEmpty())
return CSYNC_NOT_EXCLUDED;
QString p = QString::fromUtf8(path);
QRegularExpressionMatch m;
if (filetype == CSYNC_FTW_TYPE_DIR) {
m = _fullRegexDir.match(p);
} else {
m = _fullRegexFile.match(p);
}
if (m.hasMatch()) {
if (!m.captured(1).isEmpty()) {
return CSYNC_FILE_EXCLUDE_LIST;
} else if (!m.captured(2).isEmpty()) {
return CSYNC_FILE_EXCLUDE_AND_REMOVE;
}
}
return CSYNC_NOT_EXCLUDED;
}
auto ExcludedFiles::csyncTraversalMatchFun() const
@ -567,10 +591,9 @@ auto ExcludedFiles::csyncTraversalMatchFun() const
return [this](const char *path, int filetype) { return this->traversalPatternMatch(path, filetype); };
}
/* Only for bnames (not paths) */
static QString convertToBnameRegexpSyntax(QString exclude)
static QString convertToRegexpSyntax(QString exclude)
{
QString s = QRegularExpression::escape(exclude).replace("\\*", ".*").replace("\\?", ".");
QString s = QRegularExpression::escape(exclude).replace("\\*", "[^/]*").replace("\\?", "[^/]");
return s;
}
@ -578,15 +601,45 @@ void ExcludedFiles::prepare()
{
_nonRegexExcludes.clear();
// Start out with regexes that would match nothing
QString patternFileDirKeep = "a^";
QString patternFileDirRemove = "a^";
QString patternDirKeep = "a^";
QString patternDirRemove = "a^";
auto regexAppend = [](QString &pattern, QString v) {
// Build regular expressions for the different cases.
//
// To compose the _bnameActivationRegex and _fullRegex patterns we
// collect several subgroups of patterns here.
//
// * The "full" group will contain all patterns that contain a non-trailing
// slash. They only make sense in the fullRegex.
// * The "bname" group contains all patterns without a non-trailing slash.
// These need separate handling in the _fullRegex (slash-containing
// patterns must be anchored to the front, these don't need it)
// * The "bnameTrigger" group contains the bname part of all patterns in the
// "full" group. These and the "bname" group become _bnameActivationRegex.
//
// To complicate matters, the exclude patterns have two binary attributes
// meaning we'll end up with 4 variants:
// * "]" patterns mean "EXCLUDE_AND_REMOVE", they get collected in the
// pattern strings ending in "Remove". The others go to "Keep".
// * trailing-slash patterns match directories only. They get collected
// in the pattern strings saying "Dir", the others go into "FileDir"
// because they match files and directories.
QString fullFileDirKeep;
QString fullFileDirRemove;
QString fullDirKeep;
QString fullDirRemove;
QString bnameFileDirKeep;
QString bnameFileDirRemove;
QString bnameDirKeep;
QString bnameDirRemove;
QString bnameTriggerFileDir;
QString bnameTriggerDir;
auto regexAppend = [](QString &fileDirPattern, QString &dirPattern, const QString &appendMe, bool dirOnly) {
QString &pattern = dirOnly ? dirPattern : fileDirPattern;
if (!pattern.isEmpty())
pattern.append("|");
pattern.append(v);
pattern.append(appendMe);
};
for (auto exclude : _allExcludes) {
@ -603,35 +656,99 @@ void ExcludedFiles::prepare()
if (removeExcluded)
exclude = exclude.mid(1);
/* If an exclude entry contains some fnmatch-ish characters, we use the C-style codepath without QRegularEpression */
if (strchr(exclude, '/') || strchr(exclude, '[') || strchr(exclude, '{') || strchr(exclude, '\\')) {
bool fullPath = exclude.contains('/');
/* If an exclude entry contains some fnmatch-ish characters that
* we can't yet translate to regular expressions, we use the C-style
* fnmatch based codepath instead */
if (strchr(exclude, '[') || strchr(exclude, '{') || strchr(exclude, '\\')) {
_nonRegexExcludes.append(exclude);
continue;
}
/* Use QRegularExpression, append to the right pattern */
auto &patternFileDir = removeExcluded ? patternFileDirRemove : patternFileDirKeep;
auto &patternDir = removeExcluded ? patternDirRemove : patternDirKeep;
auto &bnameFileDir = removeExcluded ? bnameFileDirRemove : bnameFileDirKeep;
auto &bnameDir = removeExcluded ? bnameDirRemove : bnameDirKeep;
auto &fullFileDir = removeExcluded ? fullFileDirRemove : fullFileDirKeep;
auto &fullDir = removeExcluded ? fullDirRemove : fullDirKeep;
auto regexExclude = convertToBnameRegexpSyntax(QString::fromUtf8(exclude));
// patterns always match against directories
regexAppend(patternDir, regexExclude);
// but some don't match against files
if (!matchDirOnly)
regexAppend(patternFileDir, regexExclude);
auto regexExclude = convertToRegexpSyntax(QString::fromUtf8(exclude));
if (!fullPath) {
regexAppend(bnameFileDir, bnameDir, regexExclude, matchDirOnly);
} else {
regexAppend(fullFileDir, fullDir, regexExclude, matchDirOnly);
// for activation, trigger on the 'bname' part of the full pattern
auto bnameExclude = exclude.mid(exclude.lastIndexOf('/') + 1);
auto regexBname = convertToRegexpSyntax(bnameExclude);
regexAppend(bnameTriggerFileDir, bnameTriggerDir, regexBname, matchDirOnly);
}
}
_bnameRegexFileDir.setPattern(
"^(" + patternFileDirKeep + ")$|^(" + patternFileDirRemove + ")$");
_bnameRegexDir.setPattern(
"^(" + patternDirKeep + ")$|^(" + patternDirRemove + ")$");
// The empty pattern would match everything - change it to match-nothing
auto emptyMatchNothing = [](QString &pattern) {
if (pattern.isEmpty())
pattern = "a^";
};
emptyMatchNothing(fullFileDirKeep);
emptyMatchNothing(fullFileDirRemove);
emptyMatchNothing(fullDirKeep);
emptyMatchNothing(fullDirRemove);
QRegularExpression::PatternOptions patternOptions = QRegularExpression::OptimizeOnFirstUsageOption;
emptyMatchNothing(bnameFileDirKeep);
emptyMatchNothing(bnameFileDirRemove);
emptyMatchNothing(bnameDirKeep);
emptyMatchNothing(bnameDirRemove);
emptyMatchNothing(bnameTriggerFileDir);
emptyMatchNothing(bnameTriggerDir);
// The bname activation regexe is applied to the bname only, so must be
// anchored in the beginning and in the end. It has the explicit triggers
// plus the bname-only patterns. Here we don't care about the remove/keep
// distinction.
_bnameActivationRegexFile.setPattern(
"^(?:" + bnameFileDirKeep + "|" + bnameFileDirRemove + "|" + bnameTriggerFileDir + ")$");
_bnameActivationRegexDir.setPattern(
"^(?:" + bnameFileDirKeep + "|" + bnameFileDirRemove
+ "|" + bnameDirKeep + "|" + bnameFileDirRemove
+ "|" + bnameTriggerFileDir + "|" + bnameTriggerDir + ")$");
// The full regex has two captures, it's basic form is "(...)|(...)". The first
// capture has the keep/exclude-only patterns, the second the remove/exclude-and-remove
// patterns.
_fullRegexFile.setPattern(
QLatin1String("(")
// Full patterns are anchored to the beginning
+ "^(?:" + fullFileDirKeep + ")(?:$|/)" + "|"
// Simple bname patterns can be any path component
+ "(?:^|/)(?:" + bnameFileDirKeep + ")(?:$|/)" + "|"
// When checking a file for exclusion we must check all parent paths
// against the dir-only patterns as well.
+ "(?:^|/)(?:" + bnameDirKeep + ")/"
+ ")|("
+ "^(?:" + fullFileDirRemove + ")(?:$|/)" + "|"
+ "(?:^|/)(?:" + bnameFileDirRemove + ")(?:$|/)" + "|"
+ "(?:^|/)(?:" + bnameDirRemove + ")/"
+ ")");
_fullRegexDir.setPattern(
QLatin1String("(")
+ "^(?:" + fullFileDirKeep + "|" + fullDirKeep + ")(?:$|/)" + "|"
+ "(?:^|/)(?:" + bnameFileDirKeep + "|" + bnameDirKeep + ")(?:$|/)"
+ ")|("
+ "^(?:" + fullFileDirRemove + "|" + fullDirRemove + ")(?:$|/)" + "|"
+ "(?:^|/)(?:" + bnameFileDirRemove + "|" + bnameDirRemove + ")(?:$|/)"
+ ")");
QRegularExpression::PatternOptions patternOptions = QRegularExpression::NoPatternOption;
if (OCC::Utility::fsCasePreserving())
patternOptions |= QRegularExpression::CaseInsensitiveOption;
_bnameRegexFileDir.setPatternOptions(patternOptions);
_bnameRegexFileDir.optimize();
_bnameRegexDir.setPatternOptions(patternOptions);
_bnameRegexDir.optimize();
_bnameActivationRegexFile.setPatternOptions(patternOptions);
_bnameActivationRegexFile.optimize();
_bnameActivationRegexDir.setPatternOptions(patternOptions);
_bnameActivationRegexDir.optimize();
_fullRegexFile.setPatternOptions(patternOptions);
_fullRegexFile.optimize();
_fullRegexDir.setPatternOptions(patternOptions);
_fullRegexDir.optimize();
}

View file

@ -45,6 +45,8 @@ enum csync_exclude_type_e {
};
typedef enum csync_exclude_type_e CSYNC_EXCLUDE_TYPE;
class ExcludedFilesTest;
/**
* Manages file/directory exclusion.
*
@ -114,11 +116,7 @@ public slots:
*/
bool reloadExcludeFiles();
#ifdef CSYNC_TEST
public:
#else
private:
#endif
/**
* @brief Match the exclude pattern against the full path.
*
@ -147,8 +145,34 @@ private:
CSYNC_EXCLUDE_TYPE traversalPatternMatch(const char *path, int filetype) const;
/**
* Generate an optimized _regex for many of the patterns. The remaining
* patterns are put into _nonRegexExcludes.
* Generate an optimized regular expression for many of the patterns.
*
* The remaining patterns are put into _nonRegexExcludes.
*
* The optimization works in two steps: First, all supported patterns are put
* into _fullRegexFile/_fullRegexDir. These regexes can be applied to the full
* path to determine whether it is excluded or not.
*
* The second is a performance optimization. The particularly common use
* case for excludes during a sync run is "traversal": Instead of checking
* the full path every time, we check each parent path with the traversal
* function incrementally.
*
* Example: When the sync run eventually arrives at "a/b/c it can assume
* that the traversal matching has already been run on "a", "a/b"
* and just needs to run the traversal matcher on "a/b/c".
*
* The full matcher is equivalent to or-combining the traversal match results
* of all parent paths:
* full("a/b/c/d") == traversal("a") || traversal("a/b") || traversal("a/b/c")
*
* The traversal matcher can be extremely fast because it has a fast early-out
* case: It checks the bname part of the path against _bnameActivationRegex
* and only runs the full regex if the bname activation was triggered.
*
* Note: The traversal matcher will return not-excluded on some paths that the
* full matcher would exclude. Example: "b" is excluded. traversal("b/c")
* returns not-excluded because "c" isn't a bname activation pattern.
*/
void prepare();
@ -165,8 +189,12 @@ private:
QList<QByteArray> _nonRegexExcludes;
/// see prepare()
QRegularExpression _bnameRegexFileDir;
QRegularExpression _bnameRegexDir;
QRegularExpression _bnameActivationRegexFile;
QRegularExpression _bnameActivationRegexDir;
QRegularExpression _fullRegexFile;
QRegularExpression _fullRegexDir;
friend class ExcludedFilesTest;
};
#endif /* _CSYNC_EXCLUDE_H */

View file

@ -31,6 +31,10 @@
ExcludedFiles *excludedFiles = nullptr;
class ExcludedFilesTest
{
public:
static int setup(void **state) {
CSYNC *csync;
@ -84,22 +88,22 @@ static int teardown(void **state) {
return 0;
}
int check_file_full(const char *path)
static int check_file_full(const char *path)
{
return excludedFiles->fullPatternMatch(path, CSYNC_FTW_TYPE_FILE);
}
int check_dir_full(const char *path)
static int check_dir_full(const char *path)
{
return excludedFiles->fullPatternMatch(path, CSYNC_FTW_TYPE_DIR);
}
int check_file_traversal(const char *path)
static int check_file_traversal(const char *path)
{
return excludedFiles->traversalPatternMatch(path, CSYNC_FTW_TYPE_FILE);
}
int check_dir_traversal(const char *path)
static int check_dir_traversal(const char *path)
{
return excludedFiles->traversalPatternMatch(path, CSYNC_FTW_TYPE_DIR);
}
@ -107,18 +111,19 @@ int check_dir_traversal(const char *path)
static void check_csync_exclude_add(void **)
{
excludedFiles->addManualExclude("/tmp/check_csync1/*");
excludedFiles->prepare();
assert_int_equal(check_file_full("/tmp/check_csync1/foo"), CSYNC_FILE_EXCLUDE_LIST);
assert_int_equal(check_file_full("/tmp/check_csync2/foo"), CSYNC_NOT_EXCLUDED);
assert_true(excludedFiles->_allExcludes.contains("/tmp/check_csync1/*"));
excludedFiles->prepare();
assert_true(excludedFiles->_nonRegexExcludes.contains("/tmp/check_csync1/*"));
assert_false(excludedFiles->_bnameRegexFileDir.pattern().contains("csync1"));
assert_true(excludedFiles->_nonRegexExcludes.isEmpty());
assert_true(excludedFiles->_fullRegexFile.pattern().contains("csync1"));
assert_false(excludedFiles->_bnameActivationRegexFile.pattern().contains("csync1"));
excludedFiles->addManualExclude("foo");
excludedFiles->prepare();
assert_true(excludedFiles->_nonRegexExcludes.size() == 1);
assert_true(excludedFiles->_bnameRegexFileDir.pattern().contains("foo"));
assert_true(excludedFiles->_nonRegexExcludes.isEmpty());
assert_true(excludedFiles->_bnameActivationRegexFile.pattern().contains("foo"));
}
static void check_csync_excluded(void **)
@ -515,17 +520,21 @@ static void check_csync_exclude_expand_escapes(void **state)
SAFE_FREE(str);
}
}; // class ExcludedFilesTest
int torture_run_tests(void)
{
typedef ExcludedFilesTest T;
const struct CMUnitTest tests[] = {
cmocka_unit_test_setup_teardown(check_csync_exclude_add, setup, teardown),
cmocka_unit_test_setup_teardown(check_csync_excluded, setup_init, teardown),
cmocka_unit_test_setup_teardown(check_csync_excluded_traversal, setup_init, teardown),
cmocka_unit_test_setup_teardown(check_csync_dir_only, setup_init, teardown),
cmocka_unit_test_setup_teardown(check_csync_pathes, setup_init, teardown),
cmocka_unit_test_setup_teardown(check_csync_is_windows_reserved_word, setup_init, teardown),
cmocka_unit_test_setup_teardown(check_csync_excluded_performance, setup_init, teardown),
cmocka_unit_test(check_csync_exclude_expand_escapes),
cmocka_unit_test_setup_teardown(T::check_csync_exclude_add, T::setup, T::teardown),
cmocka_unit_test_setup_teardown(T::check_csync_excluded, T::setup_init, T::teardown),
cmocka_unit_test_setup_teardown(T::check_csync_excluded_traversal, T::setup_init, T::teardown),
cmocka_unit_test_setup_teardown(T::check_csync_dir_only, T::setup, T::teardown),
cmocka_unit_test_setup_teardown(T::check_csync_pathes, T::setup_init, T::teardown),
cmocka_unit_test_setup_teardown(T::check_csync_is_windows_reserved_word, T::setup_init, T::teardown),
cmocka_unit_test_setup_teardown(T::check_csync_excluded_performance, T::setup_init, T::teardown),
cmocka_unit_test(T::check_csync_exclude_expand_escapes),
};
return cmocka_run_group_tests(tests, NULL, NULL);