From e5c780ee1947cca2fc8389c5b5adc8e458b50406 Mon Sep 17 00:00:00 2001 From: Christian Kamm Date: Wed, 29 Nov 2017 12:13:19 +0100 Subject: [PATCH] Excludes: Translate full-path patterns to regex Improves full matches by more than an order of magnitude and also improves speed of traversal matches by roughly 20%, judging by the check_csync_exclude performance test. --- src/csync/csync_exclude.cpp | 247 +++++++++++++----- src/csync/csync_exclude.h | 44 +++- .../csync/csync_tests/check_csync_exclude.cpp | 43 +-- 3 files changed, 244 insertions(+), 90 deletions(-) diff --git a/src/csync/csync_exclude.cpp b/src/csync/csync_exclude.cpp index b3220d59e..85869d5c0 100644 --- a/src/csync/csync_exclude.cpp +++ b/src/csync/csync_exclude.cpp @@ -463,18 +463,13 @@ void ExcludedFiles::clearManualExcludes() bool ExcludedFiles::reloadExcludeFiles() { _allExcludes.clear(); - _bnameRegexFileDir = QRegularExpression(); - bool success = true; foreach (const QString &file, _excludeFiles) { if (csync_exclude_load(file.toUtf8(), &_allExcludes) < 0) success = false; } - _allExcludes.append(_manualExcludes); - prepare(); - return success; } @@ -519,37 +514,46 @@ bool ExcludedFiles::isExcluded( CSYNC_EXCLUDE_TYPE ExcludedFiles::traversalPatternMatch(const char *path, int filetype) const { - CSYNC_EXCLUDE_TYPE match = CSYNC_NOT_EXCLUDED; - /* Check only static patterns and only with the reduced list which is empty usually */ - match = _csync_excluded_common(_nonRegexExcludes, path, filetype, false); - if (match != CSYNC_NOT_EXCLUDED) { + auto match = _csync_excluded_common(_nonRegexExcludes, path, filetype, false); + if (match != CSYNC_NOT_EXCLUDED) return match; - } + if (_allExcludes.isEmpty()) + return CSYNC_NOT_EXCLUDED; - if (!_allExcludes.isEmpty()) { - /* Now check with our optimized regexps */ - const char *bname = NULL; - /* split up the path */ - bname = strrchr(path, '/'); - if (bname) { - bname += 1; // don't include the / - } else { - bname = path; - } - QString p = QString::fromUtf8(bname); - QRegularExpressionMatch m; - if (filetype == CSYNC_FTW_TYPE_DIR) { - m = _bnameRegexDir.match(p); - } else { - m = _bnameRegexFileDir.match(p); - } - if (m.hasMatch()) { - if (!m.captured(1).isEmpty()) { - match = CSYNC_FILE_EXCLUDE_LIST; - } else if (!m.captured(2).isEmpty()) { - match = CSYNC_FILE_EXCLUDE_AND_REMOVE; - } + // Check the bname part of the path to see whether the full + // regex should be run. + + auto bname = strrchr(path, '/'); + if (bname) { + bname += 1; // don't include the / + } else { + bname = path; + } + QString bnameStr = QString::fromUtf8(bname); + + QRegularExpressionMatch m; + if (filetype == CSYNC_FTW_TYPE_DIR) { + m = _bnameActivationRegexDir.match(bnameStr); + } else { + m = _bnameActivationRegexFile.match(bnameStr); + } + if (!m.hasMatch()) + return match; + + // Now run the full match + + QString pathStr = QString::fromUtf8(path); + if (filetype == CSYNC_FTW_TYPE_DIR) { + m = _fullRegexDir.match(pathStr); + } else { + m = _fullRegexFile.match(pathStr); + } + if (m.hasMatch()) { + if (!m.captured(1).isEmpty()) { + match = CSYNC_FILE_EXCLUDE_LIST; + } else if (!m.captured(2).isEmpty()) { + match = CSYNC_FILE_EXCLUDE_AND_REMOVE; } } return match; @@ -557,8 +561,28 @@ CSYNC_EXCLUDE_TYPE ExcludedFiles::traversalPatternMatch(const char *path, int fi CSYNC_EXCLUDE_TYPE ExcludedFiles::fullPatternMatch(const char *path, int filetype) const { - // bname check must also apply to all path components - return _csync_excluded_common(_allExcludes, path, filetype, true); + /* Check only static patterns and only with the reduced list which is empty usually */ + auto match = _csync_excluded_common(_nonRegexExcludes, path, filetype, true); + if (match != CSYNC_NOT_EXCLUDED) + return match; + if (_allExcludes.isEmpty()) + return CSYNC_NOT_EXCLUDED; + + QString p = QString::fromUtf8(path); + QRegularExpressionMatch m; + if (filetype == CSYNC_FTW_TYPE_DIR) { + m = _fullRegexDir.match(p); + } else { + m = _fullRegexFile.match(p); + } + if (m.hasMatch()) { + if (!m.captured(1).isEmpty()) { + return CSYNC_FILE_EXCLUDE_LIST; + } else if (!m.captured(2).isEmpty()) { + return CSYNC_FILE_EXCLUDE_AND_REMOVE; + } + } + return CSYNC_NOT_EXCLUDED; } auto ExcludedFiles::csyncTraversalMatchFun() const @@ -567,10 +591,9 @@ auto ExcludedFiles::csyncTraversalMatchFun() const return [this](const char *path, int filetype) { return this->traversalPatternMatch(path, filetype); }; } -/* Only for bnames (not paths) */ -static QString convertToBnameRegexpSyntax(QString exclude) +static QString convertToRegexpSyntax(QString exclude) { - QString s = QRegularExpression::escape(exclude).replace("\\*", ".*").replace("\\?", "."); + QString s = QRegularExpression::escape(exclude).replace("\\*", "[^/]*").replace("\\?", "[^/]"); return s; } @@ -578,15 +601,45 @@ void ExcludedFiles::prepare() { _nonRegexExcludes.clear(); - // Start out with regexes that would match nothing - QString patternFileDirKeep = "a^"; - QString patternFileDirRemove = "a^"; - QString patternDirKeep = "a^"; - QString patternDirRemove = "a^"; - auto regexAppend = [](QString &pattern, QString v) { + // Build regular expressions for the different cases. + // + // To compose the _bnameActivationRegex and _fullRegex patterns we + // collect several subgroups of patterns here. + // + // * The "full" group will contain all patterns that contain a non-trailing + // slash. They only make sense in the fullRegex. + // * The "bname" group contains all patterns without a non-trailing slash. + // These need separate handling in the _fullRegex (slash-containing + // patterns must be anchored to the front, these don't need it) + // * The "bnameTrigger" group contains the bname part of all patterns in the + // "full" group. These and the "bname" group become _bnameActivationRegex. + // + // To complicate matters, the exclude patterns have two binary attributes + // meaning we'll end up with 4 variants: + // * "]" patterns mean "EXCLUDE_AND_REMOVE", they get collected in the + // pattern strings ending in "Remove". The others go to "Keep". + // * trailing-slash patterns match directories only. They get collected + // in the pattern strings saying "Dir", the others go into "FileDir" + // because they match files and directories. + + QString fullFileDirKeep; + QString fullFileDirRemove; + QString fullDirKeep; + QString fullDirRemove; + + QString bnameFileDirKeep; + QString bnameFileDirRemove; + QString bnameDirKeep; + QString bnameDirRemove; + + QString bnameTriggerFileDir; + QString bnameTriggerDir; + + auto regexAppend = [](QString &fileDirPattern, QString &dirPattern, const QString &appendMe, bool dirOnly) { + QString &pattern = dirOnly ? dirPattern : fileDirPattern; if (!pattern.isEmpty()) pattern.append("|"); - pattern.append(v); + pattern.append(appendMe); }; for (auto exclude : _allExcludes) { @@ -603,35 +656,99 @@ void ExcludedFiles::prepare() if (removeExcluded) exclude = exclude.mid(1); - /* If an exclude entry contains some fnmatch-ish characters, we use the C-style codepath without QRegularEpression */ - if (strchr(exclude, '/') || strchr(exclude, '[') || strchr(exclude, '{') || strchr(exclude, '\\')) { + bool fullPath = exclude.contains('/'); + + /* If an exclude entry contains some fnmatch-ish characters that + * we can't yet translate to regular expressions, we use the C-style + * fnmatch based codepath instead */ + if (strchr(exclude, '[') || strchr(exclude, '{') || strchr(exclude, '\\')) { _nonRegexExcludes.append(exclude); continue; } /* Use QRegularExpression, append to the right pattern */ - auto &patternFileDir = removeExcluded ? patternFileDirRemove : patternFileDirKeep; - auto &patternDir = removeExcluded ? patternDirRemove : patternDirKeep; + auto &bnameFileDir = removeExcluded ? bnameFileDirRemove : bnameFileDirKeep; + auto &bnameDir = removeExcluded ? bnameDirRemove : bnameDirKeep; + auto &fullFileDir = removeExcluded ? fullFileDirRemove : fullFileDirKeep; + auto &fullDir = removeExcluded ? fullDirRemove : fullDirKeep; - auto regexExclude = convertToBnameRegexpSyntax(QString::fromUtf8(exclude)); - // patterns always match against directories - regexAppend(patternDir, regexExclude); - // but some don't match against files - if (!matchDirOnly) - regexAppend(patternFileDir, regexExclude); + auto regexExclude = convertToRegexpSyntax(QString::fromUtf8(exclude)); + if (!fullPath) { + regexAppend(bnameFileDir, bnameDir, regexExclude, matchDirOnly); + } else { + regexAppend(fullFileDir, fullDir, regexExclude, matchDirOnly); + + // for activation, trigger on the 'bname' part of the full pattern + auto bnameExclude = exclude.mid(exclude.lastIndexOf('/') + 1); + auto regexBname = convertToRegexpSyntax(bnameExclude); + regexAppend(bnameTriggerFileDir, bnameTriggerDir, regexBname, matchDirOnly); + } } - _bnameRegexFileDir.setPattern( - "^(" + patternFileDirKeep + ")$|^(" + patternFileDirRemove + ")$"); - _bnameRegexDir.setPattern( - "^(" + patternDirKeep + ")$|^(" + patternDirRemove + ")$"); + // The empty pattern would match everything - change it to match-nothing + auto emptyMatchNothing = [](QString &pattern) { + if (pattern.isEmpty()) + pattern = "a^"; + }; + emptyMatchNothing(fullFileDirKeep); + emptyMatchNothing(fullFileDirRemove); + emptyMatchNothing(fullDirKeep); + emptyMatchNothing(fullDirRemove); - QRegularExpression::PatternOptions patternOptions = QRegularExpression::OptimizeOnFirstUsageOption; + emptyMatchNothing(bnameFileDirKeep); + emptyMatchNothing(bnameFileDirRemove); + emptyMatchNothing(bnameDirKeep); + emptyMatchNothing(bnameDirRemove); + + emptyMatchNothing(bnameTriggerFileDir); + emptyMatchNothing(bnameTriggerDir); + + // The bname activation regexe is applied to the bname only, so must be + // anchored in the beginning and in the end. It has the explicit triggers + // plus the bname-only patterns. Here we don't care about the remove/keep + // distinction. + _bnameActivationRegexFile.setPattern( + "^(?:" + bnameFileDirKeep + "|" + bnameFileDirRemove + "|" + bnameTriggerFileDir + ")$"); + _bnameActivationRegexDir.setPattern( + "^(?:" + bnameFileDirKeep + "|" + bnameFileDirRemove + + "|" + bnameDirKeep + "|" + bnameFileDirRemove + + "|" + bnameTriggerFileDir + "|" + bnameTriggerDir + ")$"); + + // The full regex has two captures, it's basic form is "(...)|(...)". The first + // capture has the keep/exclude-only patterns, the second the remove/exclude-and-remove + // patterns. + _fullRegexFile.setPattern( + QLatin1String("(") + // Full patterns are anchored to the beginning + + "^(?:" + fullFileDirKeep + ")(?:$|/)" + "|" + // Simple bname patterns can be any path component + + "(?:^|/)(?:" + bnameFileDirKeep + ")(?:$|/)" + "|" + // When checking a file for exclusion we must check all parent paths + // against the dir-only patterns as well. + + "(?:^|/)(?:" + bnameDirKeep + ")/" + + ")|(" + + "^(?:" + fullFileDirRemove + ")(?:$|/)" + "|" + + "(?:^|/)(?:" + bnameFileDirRemove + ")(?:$|/)" + "|" + + "(?:^|/)(?:" + bnameDirRemove + ")/" + + ")"); + _fullRegexDir.setPattern( + QLatin1String("(") + + "^(?:" + fullFileDirKeep + "|" + fullDirKeep + ")(?:$|/)" + "|" + + "(?:^|/)(?:" + bnameFileDirKeep + "|" + bnameDirKeep + ")(?:$|/)" + + ")|(" + + "^(?:" + fullFileDirRemove + "|" + fullDirRemove + ")(?:$|/)" + "|" + + "(?:^|/)(?:" + bnameFileDirRemove + "|" + bnameDirRemove + ")(?:$|/)" + + ")"); + + QRegularExpression::PatternOptions patternOptions = QRegularExpression::NoPatternOption; if (OCC::Utility::fsCasePreserving()) patternOptions |= QRegularExpression::CaseInsensitiveOption; - _bnameRegexFileDir.setPatternOptions(patternOptions); - _bnameRegexFileDir.optimize(); - _bnameRegexDir.setPatternOptions(patternOptions); - _bnameRegexDir.optimize(); + _bnameActivationRegexFile.setPatternOptions(patternOptions); + _bnameActivationRegexFile.optimize(); + _bnameActivationRegexDir.setPatternOptions(patternOptions); + _bnameActivationRegexDir.optimize(); + _fullRegexFile.setPatternOptions(patternOptions); + _fullRegexFile.optimize(); + _fullRegexDir.setPatternOptions(patternOptions); + _fullRegexDir.optimize(); } - diff --git a/src/csync/csync_exclude.h b/src/csync/csync_exclude.h index 08d0a63b1..1b36138dd 100644 --- a/src/csync/csync_exclude.h +++ b/src/csync/csync_exclude.h @@ -45,6 +45,8 @@ enum csync_exclude_type_e { }; typedef enum csync_exclude_type_e CSYNC_EXCLUDE_TYPE; +class ExcludedFilesTest; + /** * Manages file/directory exclusion. * @@ -114,11 +116,7 @@ public slots: */ bool reloadExcludeFiles(); -#ifdef CSYNC_TEST -public: -#else private: -#endif /** * @brief Match the exclude pattern against the full path. * @@ -147,8 +145,34 @@ private: CSYNC_EXCLUDE_TYPE traversalPatternMatch(const char *path, int filetype) const; /** - * Generate an optimized _regex for many of the patterns. The remaining - * patterns are put into _nonRegexExcludes. + * Generate an optimized regular expression for many of the patterns. + * + * The remaining patterns are put into _nonRegexExcludes. + * + * The optimization works in two steps: First, all supported patterns are put + * into _fullRegexFile/_fullRegexDir. These regexes can be applied to the full + * path to determine whether it is excluded or not. + * + * The second is a performance optimization. The particularly common use + * case for excludes during a sync run is "traversal": Instead of checking + * the full path every time, we check each parent path with the traversal + * function incrementally. + * + * Example: When the sync run eventually arrives at "a/b/c it can assume + * that the traversal matching has already been run on "a", "a/b" + * and just needs to run the traversal matcher on "a/b/c". + * + * The full matcher is equivalent to or-combining the traversal match results + * of all parent paths: + * full("a/b/c/d") == traversal("a") || traversal("a/b") || traversal("a/b/c") + * + * The traversal matcher can be extremely fast because it has a fast early-out + * case: It checks the bname part of the path against _bnameActivationRegex + * and only runs the full regex if the bname activation was triggered. + * + * Note: The traversal matcher will return not-excluded on some paths that the + * full matcher would exclude. Example: "b" is excluded. traversal("b/c") + * returns not-excluded because "c" isn't a bname activation pattern. */ void prepare(); @@ -165,8 +189,12 @@ private: QList _nonRegexExcludes; /// see prepare() - QRegularExpression _bnameRegexFileDir; - QRegularExpression _bnameRegexDir; + QRegularExpression _bnameActivationRegexFile; + QRegularExpression _bnameActivationRegexDir; + QRegularExpression _fullRegexFile; + QRegularExpression _fullRegexDir; + + friend class ExcludedFilesTest; }; #endif /* _CSYNC_EXCLUDE_H */ diff --git a/test/csync/csync_tests/check_csync_exclude.cpp b/test/csync/csync_tests/check_csync_exclude.cpp index c8e3a48ad..057e73404 100644 --- a/test/csync/csync_tests/check_csync_exclude.cpp +++ b/test/csync/csync_tests/check_csync_exclude.cpp @@ -31,6 +31,10 @@ ExcludedFiles *excludedFiles = nullptr; +class ExcludedFilesTest +{ +public: + static int setup(void **state) { CSYNC *csync; @@ -84,22 +88,22 @@ static int teardown(void **state) { return 0; } -int check_file_full(const char *path) +static int check_file_full(const char *path) { return excludedFiles->fullPatternMatch(path, CSYNC_FTW_TYPE_FILE); } -int check_dir_full(const char *path) +static int check_dir_full(const char *path) { return excludedFiles->fullPatternMatch(path, CSYNC_FTW_TYPE_DIR); } -int check_file_traversal(const char *path) +static int check_file_traversal(const char *path) { return excludedFiles->traversalPatternMatch(path, CSYNC_FTW_TYPE_FILE); } -int check_dir_traversal(const char *path) +static int check_dir_traversal(const char *path) { return excludedFiles->traversalPatternMatch(path, CSYNC_FTW_TYPE_DIR); } @@ -107,18 +111,19 @@ int check_dir_traversal(const char *path) static void check_csync_exclude_add(void **) { excludedFiles->addManualExclude("/tmp/check_csync1/*"); + excludedFiles->prepare(); assert_int_equal(check_file_full("/tmp/check_csync1/foo"), CSYNC_FILE_EXCLUDE_LIST); assert_int_equal(check_file_full("/tmp/check_csync2/foo"), CSYNC_NOT_EXCLUDED); assert_true(excludedFiles->_allExcludes.contains("/tmp/check_csync1/*")); - excludedFiles->prepare(); - assert_true(excludedFiles->_nonRegexExcludes.contains("/tmp/check_csync1/*")); - assert_false(excludedFiles->_bnameRegexFileDir.pattern().contains("csync1")); + assert_true(excludedFiles->_nonRegexExcludes.isEmpty()); + assert_true(excludedFiles->_fullRegexFile.pattern().contains("csync1")); + assert_false(excludedFiles->_bnameActivationRegexFile.pattern().contains("csync1")); excludedFiles->addManualExclude("foo"); excludedFiles->prepare(); - assert_true(excludedFiles->_nonRegexExcludes.size() == 1); - assert_true(excludedFiles->_bnameRegexFileDir.pattern().contains("foo")); + assert_true(excludedFiles->_nonRegexExcludes.isEmpty()); + assert_true(excludedFiles->_bnameActivationRegexFile.pattern().contains("foo")); } static void check_csync_excluded(void **) @@ -515,17 +520,21 @@ static void check_csync_exclude_expand_escapes(void **state) SAFE_FREE(str); } +}; // class ExcludedFilesTest + int torture_run_tests(void) { + typedef ExcludedFilesTest T; + const struct CMUnitTest tests[] = { - cmocka_unit_test_setup_teardown(check_csync_exclude_add, setup, teardown), - cmocka_unit_test_setup_teardown(check_csync_excluded, setup_init, teardown), - cmocka_unit_test_setup_teardown(check_csync_excluded_traversal, setup_init, teardown), - cmocka_unit_test_setup_teardown(check_csync_dir_only, setup_init, teardown), - cmocka_unit_test_setup_teardown(check_csync_pathes, setup_init, teardown), - cmocka_unit_test_setup_teardown(check_csync_is_windows_reserved_word, setup_init, teardown), - cmocka_unit_test_setup_teardown(check_csync_excluded_performance, setup_init, teardown), - cmocka_unit_test(check_csync_exclude_expand_escapes), + cmocka_unit_test_setup_teardown(T::check_csync_exclude_add, T::setup, T::teardown), + cmocka_unit_test_setup_teardown(T::check_csync_excluded, T::setup_init, T::teardown), + cmocka_unit_test_setup_teardown(T::check_csync_excluded_traversal, T::setup_init, T::teardown), + cmocka_unit_test_setup_teardown(T::check_csync_dir_only, T::setup, T::teardown), + cmocka_unit_test_setup_teardown(T::check_csync_pathes, T::setup_init, T::teardown), + cmocka_unit_test_setup_teardown(T::check_csync_is_windows_reserved_word, T::setup_init, T::teardown), + cmocka_unit_test_setup_teardown(T::check_csync_excluded_performance, T::setup_init, T::teardown), + cmocka_unit_test(T::check_csync_exclude_expand_escapes), }; return cmocka_run_group_tests(tests, NULL, NULL);