Pull request: 5258-good-old-filters

Merge in DNS/adguard-home from 5258-good-old-filters to master

Updates #5258.

Squashed commit of the following:

commit 8555e685a104713e552f017de63281749f41b6b2
Author: Eugene Burkov <E.Burkov@AdGuard.COM>
Date:   Tue Dec 20 16:07:52 2022 +0400

    filtering: imp tests, docs

commit 2ecfc18fc69850a06461620a24527158603cd7b8
Author: Eugene Burkov <E.Burkov@AdGuard.COM>
Date:   Tue Dec 20 11:00:59 2022 +0400

    filtering: fix docs

commit 1ea8d45a85f3fb6794b44134e8fdcbe2044d2199
Author: Eugene Burkov <E.Burkov@AdGuard.COM>
Date:   Mon Dec 19 23:19:37 2022 +0400

    filtering: imp naming, docs

commit c52a3bba48738c002111c234fb4c312380e49cfc
Author: Eugene Burkov <E.Burkov@AdGuard.COM>
Date:   Mon Dec 19 23:13:37 2022 +0400

    filtering: imp logic

commit 3ad4276ace40f05db47b49fb033d1b0fa208ec4e
Author: Eugene Burkov <E.Burkov@AdGuard.COM>
Date:   Mon Dec 19 17:49:15 2022 +0400

    filtering: imp docs

commit 1bc3cc443bc8ec988532effaaf5f50474a1a69ab
Author: Eugene Burkov <E.Burkov@AdGuard.COM>
Date:   Mon Dec 19 17:45:37 2022 +0400

    filtering: imp more

commit 7908339a0c9fcc29e8fe12b6c5d8c14bbfa51364
Author: Eugene Burkov <E.Burkov@AdGuard.COM>
Date:   Mon Dec 19 16:57:42 2022 +0400

    filtering: imp code

commit 21bbd18b4ded83f354210ac32010d8fd1073452f
Author: Eugene Burkov <E.Burkov@AdGuard.COM>
Date:   Mon Dec 19 12:11:21 2022 +0400

    filtering: imp src reading
This commit is contained in:
Eugene Burkov 2022-12-20 16:40:42 +03:00
parent de08ef0077
commit 48cbc7bdf0
4 changed files with 168 additions and 166 deletions

View file

@ -2,6 +2,7 @@ package filtering
import (
"bufio"
"bytes"
"fmt"
"hash/crc32"
"io"
@ -12,6 +13,7 @@ import (
"strings"
"time"
"github.com/AdguardTeam/AdGuardHome/internal/aghalg"
"github.com/AdguardTeam/golibs/errors"
"github.com/AdguardTeam/golibs/log"
"github.com/AdguardTeam/golibs/stringutil"
@ -134,8 +136,8 @@ func (d *DNSFilter) filterSetProperties(
// TODO(e.burkov): The validation of the contents of the new URL is
// currently skipped if the rule list is disabled. This makes it
// possible to set a bad rules source, but the validation should still
// kick in when the filter is enabled. Consider making changing this
// behavior to be stricter.
// kick in when the filter is enabled. Consider changing this behavior
// to be stricter.
filt.unload()
}
@ -269,10 +271,10 @@ func (d *DNSFilter) periodicallyRefreshFilters() {
// already going on.
//
// TODO(e.burkov): Get rid of the concurrency pattern which requires the
// sync.Mutex.TryLock.
// [sync.Mutex.TryLock].
func (d *DNSFilter) tryRefreshFilters(block, allow, force bool) (updated int, isNetworkErr, ok bool) {
if ok = d.refreshLock.TryLock(); !ok {
return 0, false, ok
return 0, false, false
}
defer d.refreshLock.Unlock()
@ -427,52 +429,124 @@ func (d *DNSFilter) refreshFiltersIntl(block, allow, force bool) (int, bool) {
return updNum, false
}
// Allows printable UTF-8 text with CR, LF, TAB characters
func isPrintableText(data []byte, len int) bool {
for i := 0; i < len; i++ {
c := data[i]
// isPrintableText returns true if data is printable UTF-8 text with CR, LF, TAB
// characters.
//
// TODO(e.burkov): Investigate the purpose of this and improve the
// implementation. Perhaps, use something from the unicode package.
func isPrintableText(data string) (ok bool) {
for _, c := range []byte(data) {
if (c >= ' ' && c != 0x7f) || c == '\n' || c == '\r' || c == '\t' {
continue
}
return false
}
return true
}
// A helper function that parses filter contents and returns a number of rules and a filter name (if there's any)
func (d *DNSFilter) parseFilterContents(file io.Reader) (int, uint32, string) {
rulesCount := 0
name := ""
seenTitle := false
r := bufio.NewReader(file)
checksum := uint32(0)
// scanLinesWithBreak is essentially a [bufio.ScanLines] which keeps trailing
// line breaks.
func scanLinesWithBreak(data []byte, atEOF bool) (advance int, token []byte, err error) {
if atEOF && len(data) == 0 {
return 0, nil, nil
}
for {
line, err := r.ReadString('\n')
checksum = crc32.Update(checksum, crc32.IEEETable, []byte(line))
if i := bytes.IndexByte(data, '\n'); i >= 0 {
return i + 1, data[0 : i+1], nil
}
line = strings.TrimSpace(line)
if len(line) == 0 {
//
} else if line[0] == '!' {
m := d.filterTitleRegexp.FindAllStringSubmatch(line, -1)
if len(m) > 0 && len(m[0]) >= 2 && !seenTitle {
name = m[0][1]
seenTitle = true
}
if atEOF {
return len(data), data, nil
}
} else if line[0] == '#' {
//
} else {
rulesCount++
// Request more data.
return 0, nil, nil
}
// parseFilter copies the filter's content from src to dst and returns the
// number of rules, the number of bytes written, the checksum, and the title of
// the parsed list.  dst must not be nil.
func (d *DNSFilter) parseFilter(
src io.Reader,
dst io.Writer,
) (rulesNum, written int, checksum uint32, title string, err error) {
scanner := bufio.NewScanner(src)
scanner.Split(scanLinesWithBreak)
titleFound := false
for n := 0; scanner.Scan(); written += n {
line := scanner.Text()
var isRule bool
var likelyTitle string
isRule, likelyTitle, err = d.parseFilterLine(line, !titleFound, written == 0)
if err != nil {
return 0, written, 0, "", err
}
if isRule {
rulesNum++
} else if likelyTitle != "" {
title, titleFound = likelyTitle, true
}
checksum = crc32.Update(checksum, crc32.IEEETable, []byte(line))
n, err = dst.Write([]byte(line))
if err != nil {
break
return 0, written, 0, "", fmt.Errorf("writing filter line: %w", err)
}
}
return rulesCount, checksum, name
if err = scanner.Err(); err != nil {
return 0, written, 0, "", fmt.Errorf("scanning filter contents: %w", err)
}
return rulesNum, written, checksum, title, nil
}
// parseFilterLine returns true if the passed line is a rule. line is
// considered a rule if it's not a comment and contains no title.
func (d *DNSFilter) parseFilterLine(
line string,
lookForTitle bool,
testHTML bool,
) (isRule bool, title string, err error) {
if !isPrintableText(line) {
return false, "", errors.Error("filter contains non-printable characters")
}
line = strings.TrimSpace(line)
if line == "" || line[0] == '#' {
return false, "", nil
}
if testHTML && isHTML(line) {
return false, "", errors.Error("data is HTML, not plain text")
}
if line[0] == '!' && lookForTitle {
match := d.filterTitleRegexp.FindStringSubmatch(line)
if len(match) > 1 {
title = match[1]
}
return false, title, nil
}
return true, "", nil
}
// isHTML returns true if the line contains HTML tags instead of plain text.
// line should have no leading space symbols.
//
// TODO(ameshkov): It actually gives too many false positives.  Perhaps, just
// check whether the trimmed string begins with an angle bracket.
func isHTML(line string) (ok bool) {
line = strings.ToLower(line)
return strings.HasPrefix(line, "<html") || strings.HasPrefix(line, "<!doctype")
}
// Perform upgrade on a filter and update LastUpdated value
@ -485,57 +559,10 @@ func (d *DNSFilter) update(filter *FilterYAML) (bool, error) {
log.Error("os.Chtimes(): %v", e)
}
}
return b, err
}
func (d *DNSFilter) read(reader io.Reader, tmpFile *os.File, filter *FilterYAML) (int, error) {
htmlTest := true
firstChunk := make([]byte, 4*1024)
firstChunkLen := 0
buf := make([]byte, 64*1024)
total := 0
for {
n, err := reader.Read(buf)
total += n
if htmlTest {
num := len(firstChunk) - firstChunkLen
if n < num {
num = n
}
copied := copy(firstChunk[firstChunkLen:], buf[:num])
firstChunkLen += copied
if firstChunkLen == len(firstChunk) || err == io.EOF {
if !isPrintableText(firstChunk, firstChunkLen) {
return total, fmt.Errorf("data contains non-printable characters")
}
s := strings.ToLower(string(firstChunk))
if strings.Contains(s, "<html") || strings.Contains(s, "<!doctype") {
return total, fmt.Errorf("data is HTML, not plain text")
}
htmlTest = false
firstChunk = nil
}
}
_, err2 := tmpFile.Write(buf[:n])
if err2 != nil {
return total, err2
}
if err == io.EOF {
return total, nil
}
if err != nil {
log.Printf("Couldn't fetch filter contents from URL %s, skipping: %s", filter.URL, err)
return total, err
}
}
}
// finalizeUpdate closes and gets rid of temporary file f with filter's content
// according to updated.  It also saves the new values of flt's name, rules
// number, and checksum if the update succeeded.
@ -552,7 +579,8 @@ func (d *DNSFilter) finalizeUpdate(
// Close the file before renaming it because it's required on Windows.
//
// See https://github.com/adguardTeam/adGuardHome/issues/1553.
if err = file.Close(); err != nil {
err = file.Close()
if err != nil {
return fmt.Errorf("closing temporary file: %w", err)
}
@ -564,38 +592,18 @@ func (d *DNSFilter) finalizeUpdate(
log.Printf("saving filter %d contents to: %s", flt.ID, flt.Path(d.DataDir))
if err = os.Rename(tmpFileName, flt.Path(d.DataDir)); err != nil {
// Don't use renamio or maybe packages, since those will require loading the
// whole filter content to the memory on Windows.
err = os.Rename(tmpFileName, flt.Path(d.DataDir))
if err != nil {
return errors.WithDeferred(err, os.Remove(tmpFileName))
}
flt.Name = stringutil.Coalesce(flt.Name, name)
flt.checksum = cs
flt.RulesCount = rnum
flt.Name, flt.checksum, flt.RulesCount = aghalg.Coalesce(flt.Name, name), cs, rnum
return nil
}
// processUpdate copies filter's content from src to dst and returns the name,
// rules number, and checksum for it. It also returns the number of bytes read
// from src.
func (d *DNSFilter) processUpdate(
src io.Reader,
dst *os.File,
flt *FilterYAML,
) (name string, rnum int, cs uint32, n int, err error) {
if n, err = d.read(src, dst, flt); err != nil {
return "", 0, 0, 0, err
}
if _, err = dst.Seek(0, io.SeekStart); err != nil {
return "", 0, 0, 0, err
}
rnum, cs, name = d.parseFilterContents(dst)
return name, rnum, cs, n, nil
}
// updateIntl updates the flt rewriting it's actual file. It returns true if
// the actual update has been performed.
func (d *DNSFilter) updateIntl(flt *FilterYAML) (ok bool, err error) {
@ -612,31 +620,21 @@ func (d *DNSFilter) updateIntl(flt *FilterYAML) (ok bool, err error) {
}
defer func() {
err = errors.WithDeferred(err, d.finalizeUpdate(tmpFile, flt, ok, name, rnum, cs))
ok = ok && err == nil
if ok {
if ok && err == nil {
log.Printf("updated filter %d: %d bytes, %d rules", flt.ID, n, rnum)
}
}()
// Change the default 0o600 permission to something more acceptable by
// end users.
// Change the default 0o600 permission to something more acceptable by end
// users.
//
// See https://github.com/AdguardTeam/AdGuardHome/issues/3198.
if err = tmpFile.Chmod(0o644); err != nil {
return false, fmt.Errorf("changing file mode: %w", err)
}
var r io.Reader
if filepath.IsAbs(flt.URL) {
var file io.ReadCloser
file, err = os.Open(flt.URL)
if err != nil {
return false, fmt.Errorf("open file: %w", err)
}
defer func() { err = errors.WithDeferred(err, file.Close()) }()
r = file
} else {
var rc io.ReadCloser
if !filepath.IsAbs(flt.URL) {
var resp *http.Response
resp, err = d.HTTPClient.Get(flt.URL)
if err != nil {
@ -649,24 +647,30 @@ func (d *DNSFilter) updateIntl(flt *FilterYAML) (ok bool, err error) {
if resp.StatusCode != http.StatusOK {
log.Printf("got status code %d from %s, skip", resp.StatusCode, flt.URL)
return false, fmt.Errorf("got status code != 200: %d", resp.StatusCode)
return false, fmt.Errorf("got status code %d, want %d", resp.StatusCode, http.StatusOK)
}
r = resp.Body
rc = resp.Body
} else {
rc, err = os.Open(flt.URL)
if err != nil {
return false, fmt.Errorf("open file: %w", err)
}
defer func() { err = errors.WithDeferred(err, rc.Close()) }()
}
name, rnum, cs, n, err = d.processUpdate(r, tmpFile, flt)
rnum, n, cs, name, err = d.parseFilter(rc, tmpFile)
return cs != flt.checksum, err
return cs != flt.checksum && err == nil, err
}
// loads filter contents from the file in dataDir
func (d *DNSFilter) load(filter *FilterYAML) (err error) {
filterFilePath := filter.Path(d.DataDir)
func (d *DNSFilter) load(flt *FilterYAML) (err error) {
fileName := flt.Path(d.DataDir)
log.Tracef("filtering: loading filter %d from %s", filter.ID, filterFilePath)
log.Debug("filtering: loading filter %d from %s", flt.ID, fileName)
file, err := os.Open(filterFilePath)
file, err := os.Open(fileName)
if errors.Is(err, os.ErrNotExist) {
// Do nothing, file doesn't exist.
return nil
@ -680,13 +684,14 @@ func (d *DNSFilter) load(filter *FilterYAML) (err error) {
return fmt.Errorf("getting filter file stat: %w", err)
}
log.Tracef("filtering: File %s, id %d, length %d", filterFilePath, filter.ID, st.Size())
log.Debug("filtering: file %s, id %d, length %d", fileName, flt.ID, st.Size())
rulesCount, checksum, _ := d.parseFilterContents(file)
rulesCount, _, checksum, _, err := d.parseFilter(file, io.Discard)
if err != nil {
return fmt.Errorf("parsing filter file: %w", err)
}
filter.RulesCount = rulesCount
filter.checksum = checksum
filter.LastUpdated = st.ModTime()
flt.RulesCount, flt.checksum, flt.LastUpdated = rulesCount, checksum, st.ModTime()
return nil
}

View file

@ -4,33 +4,23 @@ import (
"io/fs"
"net"
"net/http"
"net/netip"
"net/url"
"os"
"path/filepath"
"testing"
"time"
"github.com/AdguardTeam/golibs/netutil"
"github.com/AdguardTeam/AdGuardHome/internal/aghhttp"
"github.com/AdguardTeam/golibs/testutil"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
// serveFiltersLocally is a helper that concurrently listens on a free port to
// respond with fltContent. It also gracefully closes the listener when the
// test under t finishes.
func serveFiltersLocally(t *testing.T, fltContent []byte) (ipp netip.AddrPort) {
// serveHTTPLocally starts a new HTTP server, that handles its index with h. It
// also gracefully closes the listener when the test under t finishes.
func serveHTTPLocally(t *testing.T, h http.Handler) (urlStr string) {
t.Helper()
h := http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
pt := testutil.PanicT{}
n, werr := w.Write(fltContent)
require.NoError(pt, werr)
require.Equal(pt, len(fltContent), n)
})
l, err := net.Listen("tcp", ":0")
require.NoError(t, err)
@ -38,9 +28,26 @@ func serveFiltersLocally(t *testing.T, fltContent []byte) (ipp netip.AddrPort) {
testutil.CleanupAndRequireSuccess(t, l.Close)
addr := l.Addr()
require.IsType(t, new(net.TCPAddr), addr)
require.IsType(t, (*net.TCPAddr)(nil), addr)
return netip.AddrPortFrom(netutil.IPv4Localhost(), uint16(addr.(*net.TCPAddr).Port))
return (&url.URL{
Scheme: aghhttp.SchemeHTTP,
Host: addr.String(),
}).String()
}
// serveFiltersLocally is a helper that concurrently listens on a free port to
// respond with fltContent.
func serveFiltersLocally(t *testing.T, fltContent []byte) (urlStr string) {
t.Helper()
return serveHTTPLocally(t, http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
pt := testutil.PanicT{}
n, werr := w.Write(fltContent)
require.NoError(pt, werr)
require.Equal(pt, len(fltContent), n)
}))
}
func TestFilters(t *testing.T) {
@ -65,10 +72,7 @@ func TestFilters(t *testing.T) {
require.NoError(t, err)
f := &FilterYAML{
URL: (&url.URL{
Scheme: "http",
Host: addr.String(),
}).String(),
URL: addr,
}
updateAndAssert := func(t *testing.T, want require.BoolAssertionFunc, wantRulesCount int) {
@ -103,11 +107,7 @@ func TestFilters(t *testing.T) {
anotherContent := []byte(`||example.com^`)
oldURL := f.URL
ipp := serveFiltersLocally(t, anotherContent)
f.URL = (&url.URL{
Scheme: "http",
Host: ipp.String(),
}).String()
f.URL = serveFiltersLocally(t, anotherContent)
t.Cleanup(func() { f.URL = oldURL })
updateAndAssert(t, require.True, 1)

View file

@ -190,6 +190,8 @@ type DNSFilter struct {
// filterTitleRegexp is the regular expression to retrieve a name of a
// filter list.
//
// TODO(e.burkov): Don't use regexp for such a simple text processing task.
filterTitleRegexp *regexp.Regexp
hostCheckers []hostChecker

View file

@ -5,7 +5,6 @@ import (
"encoding/json"
"net/http"
"net/http/httptest"
"net/url"
"testing"
"time"
@ -30,11 +29,7 @@ func TestDNSFilter_handleFilteringSetURL(t *testing.T) {
endpoint: &badRulesEndpoint,
content: []byte(`<html></html>`),
}} {
ipp := serveFiltersLocally(t, rulesSource.content)
*rulesSource.endpoint = (&url.URL{
Scheme: "http",
Host: ipp.String(),
}).String()
*rulesSource.endpoint = serveFiltersLocally(t, rulesSource.content)
}
testCases := []struct {