From 48cbc7bdf0b031725e99dce9f7be23a6df3ff49a Mon Sep 17 00:00:00 2001 From: Eugene Burkov Date: Tue, 20 Dec 2022 16:40:42 +0300 Subject: [PATCH] Pull request: 5258-good-old-filters Merge in DNS/adguard-home from 5258-good-old-filters to master Updates #5258. Squashed commit of the following: commit 8555e685a104713e552f017de63281749f41b6b2 Author: Eugene Burkov Date: Tue Dec 20 16:07:52 2022 +0400 filtering: imp tests, docs commit 2ecfc18fc69850a06461620a24527158603cd7b8 Author: Eugene Burkov Date: Tue Dec 20 11:00:59 2022 +0400 filtering: fix docs commit 1ea8d45a85f3fb6794b44134e8fdcbe2044d2199 Author: Eugene Burkov Date: Mon Dec 19 23:19:37 2022 +0400 filtering: imp naming, docs commit c52a3bba48738c002111c234fb4c312380e49cfc Author: Eugene Burkov Date: Mon Dec 19 23:13:37 2022 +0400 filtering: imp logic commit 3ad4276ace40f05db47b49fb033d1b0fa208ec4e Author: Eugene Burkov Date: Mon Dec 19 17:49:15 2022 +0400 filtering: imp docs commit 1bc3cc443bc8ec988532effaaf5f50474a1a69ab Author: Eugene Burkov Date: Mon Dec 19 17:45:37 2022 +0400 filtering: imp more commit 7908339a0c9fcc29e8fe12b6c5d8c14bbfa51364 Author: Eugene Burkov Date: Mon Dec 19 16:57:42 2022 +0400 filtering: imp code commit 21bbd18b4ded83f354210ac32010d8fd1073452f Author: Eugene Burkov Date: Mon Dec 19 12:11:21 2022 +0400 filtering: imp src reading --- internal/filtering/filter.go | 275 +++++++++++++++--------------- internal/filtering/filter_test.go | 50 +++--- internal/filtering/filtering.go | 2 + internal/filtering/http_test.go | 7 +- 4 files changed, 168 insertions(+), 166 deletions(-) diff --git a/internal/filtering/filter.go b/internal/filtering/filter.go index 070ce73c..953d105b 100644 --- a/internal/filtering/filter.go +++ b/internal/filtering/filter.go @@ -2,6 +2,7 @@ package filtering import ( "bufio" + "bytes" "fmt" "hash/crc32" "io" @@ -12,6 +13,7 @@ import ( "strings" "time" + "github.com/AdguardTeam/AdGuardHome/internal/aghalg" "github.com/AdguardTeam/golibs/errors" 
"github.com/AdguardTeam/golibs/log" "github.com/AdguardTeam/golibs/stringutil" @@ -134,8 +136,8 @@ func (d *DNSFilter) filterSetProperties( // TODO(e.burkov): The validation of the contents of the new URL is // currently skipped if the rule list is disabled. This makes it // possible to set a bad rules source, but the validation should still - // kick in when the filter is enabled. Consider making changing this - // behavior to be stricter. + // kick in when the filter is enabled. Consider changing this behavior + // to be stricter. filt.unload() } @@ -269,10 +271,10 @@ func (d *DNSFilter) periodicallyRefreshFilters() { // already going on. // // TODO(e.burkov): Get rid of the concurrency pattern which requires the -// sync.Mutex.TryLock. +// [sync.Mutex.TryLock]. func (d *DNSFilter) tryRefreshFilters(block, allow, force bool) (updated int, isNetworkErr, ok bool) { if ok = d.refreshLock.TryLock(); !ok { - return 0, false, ok + return 0, false, false } defer d.refreshLock.Unlock() @@ -427,52 +429,124 @@ func (d *DNSFilter) refreshFiltersIntl(block, allow, force bool) (int, bool) { return updNum, false } -// Allows printable UTF-8 text with CR, LF, TAB characters -func isPrintableText(data []byte, len int) bool { - for i := 0; i < len; i++ { - c := data[i] +// isPrintableText returns true if data is printable UTF-8 text with CR, LF, TAB +// characters. +// +// TODO(e.burkov): Investigate the purpose of this and improve the +// implementation. Perhaps, use something from the unicode package. 
+func isPrintableText(data string) (ok bool) { + for _, c := range []byte(data) { if (c >= ' ' && c != 0x7f) || c == '\n' || c == '\r' || c == '\t' { continue } + return false } + return true } -// A helper function that parses filter contents and returns a number of rules and a filter name (if there's any) -func (d *DNSFilter) parseFilterContents(file io.Reader) (int, uint32, string) { - rulesCount := 0 - name := "" - seenTitle := false - r := bufio.NewReader(file) - checksum := uint32(0) +// scanLinesWithBreak is essentially a [bufio.ScanLines] which keeps trailing +// line breaks. +func scanLinesWithBreak(data []byte, atEOF bool) (advance int, token []byte, err error) { + if atEOF && len(data) == 0 { + return 0, nil, nil + } - for { - line, err := r.ReadString('\n') - checksum = crc32.Update(checksum, crc32.IEEETable, []byte(line)) + if i := bytes.IndexByte(data, '\n'); i >= 0 { + return i + 1, data[0 : i+1], nil + } - line = strings.TrimSpace(line) - if len(line) == 0 { - // - } else if line[0] == '!' { - m := d.filterTitleRegexp.FindAllStringSubmatch(line, -1) - if len(m) > 0 && len(m[0]) >= 2 && !seenTitle { - name = m[0][1] - seenTitle = true - } + if atEOF { + return len(data), data, nil + } - } else if line[0] == '#' { - // - } else { - rulesCount++ + // Request more data. + return 0, nil, nil +} + +// parseFilter copies filter's content from src to dst and returns the number of +// rules, name, number of bytes written, checksum, and title of the parsed list. +// dst must not be nil. 
+func (d *DNSFilter) parseFilter( + src io.Reader, + dst io.Writer, +) (rulesNum, written int, checksum uint32, title string, err error) { + scanner := bufio.NewScanner(src) + scanner.Split(scanLinesWithBreak) + + titleFound := false + for n := 0; scanner.Scan(); written += n { + line := scanner.Text() + var isRule bool + var likelyTitle string + isRule, likelyTitle, err = d.parseFilterLine(line, !titleFound, written == 0) + if err != nil { + return 0, written, 0, "", err + } + if isRule { + rulesNum++ + } else if likelyTitle != "" { + title, titleFound = likelyTitle, true + } + + checksum = crc32.Update(checksum, crc32.IEEETable, []byte(line)) + + n, err = dst.Write([]byte(line)) if err != nil { + return 0, written, 0, "", fmt.Errorf("writing filter line: %w", err) + } + } - return rulesCount, checksum, name + if err = scanner.Err(); err != nil { + return 0, written, 0, "", fmt.Errorf("scanning filter contents: %w", err) + } + + return rulesNum, written, checksum, title, nil +} + +// parseFilterLine returns true if the passed line is a rule. line is +// considered a rule if it's not a comment and contains no title. +func (d *DNSFilter) parseFilterLine( + line string, + lookForTitle bool, + testHTML bool, +) (isRule bool, title string, err error) { + if !isPrintableText(line) { + return false, "", errors.Error("filter contains non-printable characters") + } + + line = strings.TrimSpace(line) + if line == "" || line[0] == '#' { + return false, "", nil + } + + if testHTML && isHTML(line) { + return false, "", errors.Error("data is HTML, not plain text") + } + + if line[0] == '!' && lookForTitle { + match := d.filterTitleRegexp.FindStringSubmatch(line) + if len(match) > 1 { + title = match[1] + } + + return false, title, nil + } + + return true, "", nil +} + +// isHTML returns true if the line contains HTML tags instead of plain text. +// line should have no leading space symbols. +// +// TODO(ameshkov): It actually gives too many false positives. 
Perhaps, just +// check if trimmed string begins with angle bracket. +func isHTML(line string) (ok bool) { + line = strings.ToLower(line) + + return strings.HasPrefix(line, "`), }} { - ipp := serveFiltersLocally(t, rulesSource.content) - *rulesSource.endpoint = (&url.URL{ - Scheme: "http", - Host: ipp.String(), - }).String() + *rulesSource.endpoint = serveFiltersLocally(t, rulesSource.content) } testCases := []struct {