Pull request 2109: AG-20945-rule-list-filter

Squashed commit of the following:

commit 2da8c1754f349a9b7f8b629de8f0c892b9bae4dc
Merge: 5cea6a6a2 4fc6bf504
Author: Ainar Garipov <A.Garipov@AdGuard.COM>
Date:   Tue Dec 19 21:14:07 2023 +0300

    Merge branch 'master' into AG-20945-rule-list-filter

commit 5cea6a6a2bed88f645828ab5b4e7de09f9bf91ec
Author: Ainar Garipov <A.Garipov@AdGuard.COM>
Date:   Tue Dec 19 17:53:21 2023 +0300

    filtering/rulelist: imp docs, tests

commit f01434b37a
Author: Ainar Garipov <A.Garipov@AdGuard.COM>
Date:   Thu Dec 14 19:17:02 2023 +0300

    filtering/rulelist: imp names

commit fe2bf68e6b
Author: Ainar Garipov <A.Garipov@AdGuard.COM>
Date:   Thu Dec 14 19:07:53 2023 +0300

    all: go mod tidy

commit c7081d3486
Author: Ainar Garipov <A.Garipov@AdGuard.COM>
Date:   Thu Dec 14 19:03:33 2023 +0300

    filtering/rulelist: add filter
This commit is contained in:
Ainar Garipov 2023-12-19 21:21:17 +03:00
parent 4fc6bf504e
commit 0920bb99fe
8 changed files with 531 additions and 17 deletions

3
go.mod
View file

@ -9,6 +9,7 @@ require (
github.com/NYTimes/gziphandler v1.1.1
github.com/ameshkov/dnscrypt/v2 v2.2.7
github.com/bluele/gcache v0.0.2
github.com/c2h5oh/datasize v0.0.0-20220606134207-859f65c6625b
github.com/digineo/go-ipset/v2 v2.2.1
github.com/dimfeld/httptreemux/v5 v5.5.0
github.com/fsnotify/fsnotify v1.7.0
@ -16,7 +17,7 @@ require (
github.com/google/go-cmp v0.6.0
github.com/google/gopacket v1.1.19
github.com/google/renameio/v2 v2.0.0
github.com/google/uuid v1.4.0
github.com/google/uuid v1.5.0
github.com/insomniacslk/dhcp v0.0.0-20231206064809-8c70d406f6d2
github.com/josharian/native v1.1.1-0.20230202152459-5c7d0dd6ab86
github.com/kardianos/service v1.2.2

6
go.sum
View file

@ -18,6 +18,8 @@ github.com/beefsack/go-rate v0.0.0-20220214233405-116f4ca011a0 h1:0b2vaepXIfMsG+
github.com/beefsack/go-rate v0.0.0-20220214233405-116f4ca011a0/go.mod h1:6YNgTHLutezwnBvyneBbwvB8C82y3dcoOj5EQJIdGXA=
github.com/bluele/gcache v0.0.2 h1:WcbfdXICg7G/DGBh1PFfcirkWOQV+v077yF1pSy3DGw=
github.com/bluele/gcache v0.0.2/go.mod h1:m15KV+ECjptwSPxKhOhQoAFQVtUFjTVkc3H8o0t/fp0=
github.com/c2h5oh/datasize v0.0.0-20220606134207-859f65c6625b h1:6+ZFm0flnudZzdSE0JxlhR2hKnGPcNB35BjQf4RYQDY=
github.com/c2h5oh/datasize v0.0.0-20220606134207-859f65c6625b/go.mod h1:S/7n9copUssQ56c7aAgHqftWO4LTf4xY6CGWt8Bc+3M=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
@ -46,8 +48,8 @@ github.com/google/pprof v0.0.0-20231205033806-a5a03c77bf08/go.mod h1:czg5+yv1E0Z
github.com/google/renameio/v2 v2.0.0 h1:UifI23ZTGY8Tt29JbYFiuyIU3eX+RNFtUwefq9qAhxg=
github.com/google/renameio/v2 v2.0.0/go.mod h1:BtmJXm5YlszgC+TD4HOEEUFgkJP3nLxehU6hfe7jRt4=
github.com/google/uuid v1.2.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
github.com/google/uuid v1.4.0 h1:MtMxsa51/r9yyhkyLsVeVt0B+BGQZzpQiTQ4eHZ8bc4=
github.com/google/uuid v1.4.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
github.com/google/uuid v1.5.0 h1:1p67kYwdtXjb0gL0BPiP1Av9wiZPo5A8z2cWkTZ+eyU=
github.com/google/uuid v1.5.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
github.com/hugelgupf/socketpair v0.0.0-20190730060125-05d35a94e714 h1:/jC7qQFrv8CrSJVmaolDVOxTfS9kc36uB6H40kdbQq8=
github.com/insomniacslk/dhcp v0.0.0-20231206064809-8c70d406f6d2 h1:9K06NfxkBh25x56yVhWWlKFE8YpicaSfHwoV8SFbueA=
github.com/insomniacslk/dhcp v0.0.0-20231206064809-8c70d406f6d2/go.mod h1:3A9PQ1cunSDF/1rbTq99Ts4pVnycWg+vlPkfeD2NLFI=

View file

@ -24,23 +24,25 @@ func validateFilterURL(urlStr string) (err error) {
if filepath.IsAbs(urlStr) {
_, err = os.Stat(urlStr)
if err != nil {
// Don't wrap the error since it's informative enough as is.
return err
}
return nil
// Don't wrap the error since it's informative enough as is.
return err
}
u, err := url.ParseRequestURI(urlStr)
if err != nil {
// Don't wrap the error since it's informative enough as is.
return err
} else if s := u.Scheme; s != aghhttp.SchemeHTTP && s != aghhttp.SchemeHTTPS {
}
if s := u.Scheme; s != aghhttp.SchemeHTTP && s != aghhttp.SchemeHTTPS {
return &url.Error{
Op: "Check scheme",
URL: urlStr,
Err: fmt.Errorf("only %v allowed", []string{aghhttp.SchemeHTTP, aghhttp.SchemeHTTPS}),
Err: fmt.Errorf("only %v allowed", []string{
aghhttp.SchemeHTTP,
aghhttp.SchemeHTTPS,
}),
}
}

View file

@ -0,0 +1,338 @@
package rulelist
import (
"bytes"
"context"
"fmt"
"io"
"net/http"
"net/url"
"os"
"path/filepath"
"time"
"github.com/AdguardTeam/AdGuardHome/internal/aghrenameio"
"github.com/AdguardTeam/golibs/errors"
"github.com/AdguardTeam/golibs/ioutil"
"github.com/AdguardTeam/golibs/log"
"github.com/AdguardTeam/urlfilter/filterlist"
"github.com/c2h5oh/datasize"
)
// Filter contains information about a single rule-list filter.
//
// TODO(a.garipov): Use.
type Filter struct {
	// url is the URL of this rule list.  Supported schemes are:
	//   - http
	//   - https
	//   - file
	url *url.URL

	// ruleList is the last successfully compiled [filterlist.RuleList].  It is
	// nil until the first successful refresh.
	ruleList filterlist.RuleList

	// updated is the time of the last successful update.  It is only set when
	// a refresh produces data with a new checksum.
	updated time.Time

	// name is the human-readable name of this rule-list filter.  See
	// [Filter.setName] for how it is chosen.
	name string

	// uid is the unique ID of this rule-list filter.  It is also used as the
	// base name of the cache file.
	uid UID

	// urlFilterID is used for working with package urlfilter.
	urlFilterID URLFilterID

	// rulesCount contains the number of rules in this rule-list filter, as
	// reported by the last refresh.
	rulesCount int

	// checksum is a CRC32 hash used to quickly check if the rules within a list
	// file have changed.
	checksum uint32

	// enabled, if true, means that this rule-list filter is used for filtering.
	//
	// TODO(a.garipov): Take into account.
	enabled bool
}
// FilterConfig contains the configuration for a [Filter].
type FilterConfig struct {
	// URL is the URL of this rule-list filter.  Supported schemes are:
	//   - http
	//   - https
	//   - file
	//
	// It must not be nil, and its scheme is validated by [NewFilter].
	URL *url.URL

	// Name is the human-readable name of this rule-list filter.  If not set, it
	// is either taken from the rule-list data or generated synthetically from
	// the UID.
	Name string

	// UID is the unique ID of this rule-list filter.
	UID UID

	// URLFilterID is used for working with package urlfilter.
	URLFilterID URLFilterID

	// Enabled, if true, means that this rule-list filter is used for filtering.
	Enabled bool
}
// NewFilter creates a new rule-list filter.  The filter is not refreshed, so a
// refresh should be performed before use.  c.URL must not be nil and must use
// one of the schemes "http", "https", or "file".
func NewFilter(c *FilterConfig) (f *Filter, err error) {
	u := c.URL
	if u == nil {
		return nil, errors.Error("no url")
	}

	// Reject unsupported schemes up front so that Refresh can treat a bad
	// scheme as a programmer error.
	scheme := u.Scheme
	if scheme != "http" && scheme != "https" && scheme != "file" {
		return nil, fmt.Errorf("bad url scheme: %q", scheme)
	}

	f = &Filter{
		url:         u,
		name:        c.Name,
		uid:         c.UID,
		urlFilterID: c.URLFilterID,
		enabled:     c.Enabled,
	}

	return f, nil
}
// Refresh updates the data in the rule-list filter.  parseBuf is the initial
// buffer used to parse information from the data.  cli and maxSize are only
// used when f is a URL-based list.  The cache file is written to cacheDir
// under the filter's UID.
func (f *Filter) Refresh(
	ctx context.Context,
	parseBuf []byte,
	cli *http.Client,
	cacheDir string,
	maxSize datasize.ByteSize,
) (parseRes *ParseResult, err error) {
	cachePath := filepath.Join(cacheDir, f.uid.String()+".txt")

	var res *ParseResult
	switch s := f.url.Scheme; s {
	case "http", "https":
		res, err = f.setFromHTTP(ctx, parseBuf, cli, cachePath, maxSize.Bytes())
	case "file":
		res, err = f.setFromFile(parseBuf, f.url.Path, cachePath)
	default:
		// Since the URL has been prevalidated in NewFilter, consider this a
		// programmer error.
		panic(fmt.Errorf("bad url scheme: %q", s))
	}
	if err != nil {
		// Don't wrap the error, because it's informative enough as is.
		return nil, err
	}

	// Only update the metadata when the parsed data has actually changed.
	if res.Checksum != f.checksum {
		f.checksum = res.Checksum
		f.rulesCount = res.RulesCount
		f.setName(res.Title)
		f.updated = time.Now()
	}

	return res, nil
}
// setFromHTTP sets the rule-list filter's data from its URL.  It also caches
// the data into the file at cachePath.
func (f *Filter) setFromHTTP(
	ctx context.Context,
	parseBuf []byte,
	cli *http.Client,
	cachePath string,
	maxSize uint64,
) (parseRes *ParseResult, err error) {
	defer func() { err = errors.Annotate(err, "setting from http: %w") }()

	text, res, err := f.readFromHTTP(ctx, parseBuf, cli, cachePath, maxSize)
	if err != nil {
		// Don't wrap the error, because it's informative enough as is.
		return nil, err
	}

	// TODO(a.garipov): Add filterlist.BytesRuleList.
	rl := &filterlist.StringRuleList{
		ID:             f.urlFilterID,
		RulesText:      text,
		IgnoreCosmetic: true,
	}
	f.ruleList = rl

	return res, nil
}
// readFromHTTP reads the data from the rule-list filter's URL into the cache
// file as well as returns it as a string.  The data is filtered through a
// parser and so is free from comments, unnecessary whitespace, etc.  The read
// is limited to maxSize bytes.
func (f *Filter) readFromHTTP(
	ctx context.Context,
	parseBuf []byte,
	cli *http.Client,
	cachePath string,
	maxSize uint64,
) (text string, parseRes *ParseResult, err error) {
	urlStr := f.url.String()
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, urlStr, nil)
	if err != nil {
		return "", nil, fmt.Errorf("making request for http url %q: %w", urlStr, err)
	}

	resp, err := cli.Do(req)
	if err != nil {
		return "", nil, fmt.Errorf("requesting from http url: %w", err)
	}
	// Make sure an error from closing the body is not silently dropped.
	defer func() { err = errors.WithDeferred(err, resp.Body.Close()) }()

	// TODO(a.garipov): Use [agdhttp.CheckStatus] when it's moved to golibs.
	if resp.StatusCode != http.StatusOK {
		return "", nil, fmt.Errorf("got status code %d, want %d", resp.StatusCode, http.StatusOK)
	}

	fltFile, err := aghrenameio.NewPendingFile(cachePath, 0o644)
	if err != nil {
		return "", nil, fmt.Errorf("creating temp file: %w", err)
	}
	// This deferred cleanup commits the pending file on success and removes it
	// on error.  NOTE(review): defers run LIFO, so this commit happens before
	// the body-close defer above; a close error after a successful parse could
	// surface after the cache file is already committed — confirm that this is
	// acceptable.
	defer func() { err = aghrenameio.WithDeferredCleanup(err, fltFile) }()

	// Write the parsed output both to the in-memory buffer (returned as text)
	// and to the cache file in a single pass.
	buf := &bytes.Buffer{}
	mw := io.MultiWriter(buf, fltFile)

	parser := NewParser()
	httpBody := ioutil.LimitReader(resp.Body, maxSize)
	parseRes, err = parser.Parse(mw, httpBody, parseBuf)
	if err != nil {
		return "", nil, fmt.Errorf("parsing response from http url %q: %w", urlStr, err)
	}

	return buf.String(), parseRes, nil
}
// setName sets the title using either the already-present name, the given title
// from the rule-list data, or a synthetic name derived from the UID.
func (f *Filter) setName(title string) {
	switch {
	case f.name != "":
		// Keep the explicitly configured name.
	case title != "":
		f.name = title
	default:
		f.name = fmt.Sprintf("List %s", f.uid)
	}
}
// setFromFile sets the rule-list filter's data from a file path.  It also
// caches the data into a file.
//
// TODO(a.garipov): Retest on Windows once rule-list updater is committed.  See
// if calling Close is necessary here.
func (f *Filter) setFromFile(
	parseBuf []byte,
	filePath string,
	cachePath string,
) (parseRes *ParseResult, err error) {
	defer func() { err = errors.Annotate(err, "setting from file: %w") }()

	// Copy and parse the source file into the cache file first.
	parseRes, err = parseIntoCache(parseBuf, filePath, cachePath)
	if err != nil {
		// Don't wrap the error, because it's informative enough as is.
		return nil, err
	}

	// Close the previous rule list before opening the new one.  NOTE(review):
	// if NewFileRuleList fails below, f.ruleList is left pointing at the
	// already-closed previous list — confirm callers treat f as unusable after
	// an error here.
	err = f.Close()
	if err != nil {
		return nil, fmt.Errorf("closing old rule list: %w", err)
	}

	rl, err := filterlist.NewFileRuleList(f.urlFilterID, cachePath, true)
	if err != nil {
		return nil, fmt.Errorf("opening new rule list: %w", err)
	}

	f.ruleList = rl

	return parseRes, nil
}
// parseIntoCache copies the relevant data from filePath into cachePath while
// also parsing it.  The cache file is committed atomically on success and
// removed on error.
func parseIntoCache(
	parseBuf []byte,
	filePath string,
	cachePath string,
) (parseRes *ParseResult, err error) {
	pending, err := aghrenameio.NewPendingFile(cachePath, 0o644)
	if err != nil {
		return nil, fmt.Errorf("creating temp file: %w", err)
	}
	defer func() { err = aghrenameio.WithDeferredCleanup(err, pending) }()

	// #nosec G304 -- Assume that cachePath is always cacheDir joined with a
	// uid using [filepath.Join].
	src, err := os.Open(filePath)
	if err != nil {
		return nil, fmt.Errorf("opening src file: %w", err)
	}
	defer func() { err = errors.WithDeferred(err, src.Close()) }()

	parseRes, err = NewParser().Parse(pending, src, parseBuf)
	if err != nil {
		return nil, fmt.Errorf("copying src file: %w", err)
	}

	return parseRes, nil
}
// Close closes the underlying rule list, if any.  Calling Close on a filter
// that has never been refreshed is a no-op.
func (f *Filter) Close() (err error) {
	rl := f.ruleList
	if rl == nil {
		return nil
	}

	return rl.Close()
}
// filterUpdate represents a single ongoing rule-list filter update.
//
//lint:ignore U1000 TODO(a.garipov): Use.
type filterUpdate struct {
	// httpCli is the HTTP client passed to [Filter.Refresh] for URL-based
	// lists.
	httpCli *http.Client

	// cacheDir is the directory where per-filter cache files are written.
	cacheDir string

	// name is used only in log messages; presumably the name of the filter
	// group being updated — verify against callers once this type is used.
	name string

	// parseBuf is the buffer used to parse lines of the rule lists.
	parseBuf []byte

	// maxSize is the maximum allowed size of a downloaded rule list.
	maxSize datasize.ByteSize
}
// process runs an update of a single rule-list filter f, logging whether the
// filter's contents changed.
func (u *filterUpdate) process(ctx context.Context, f *Filter) (err error) {
	prevChecksum := f.checksum
	res, err := f.Refresh(ctx, u.parseBuf, u.httpCli, u.cacheDir, u.maxSize)
	if err != nil {
		return fmt.Errorf("updating %s: %w", f.uid, err)
	}

	if res.Checksum == prevChecksum {
		log.Info("filtering: filter %q: filter %q: no change", u.name, f.uid)

		return nil
	}

	log.Info(
		"filtering: updated filter %q: filter %q: %d bytes, %d rules",
		u.name,
		f.uid,
		res.BytesWritten,
		res.RulesCount,
	)

	return nil
}

View file

@ -0,0 +1,107 @@
package rulelist_test
import (
"context"
"io"
"net/http"
"net/http/httptest"
"net/url"
"os"
"path/filepath"
"testing"
"github.com/AdguardTeam/AdGuardHome/internal/filtering/rulelist"
"github.com/AdguardTeam/golibs/testutil"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
// TestFilter_Refresh checks both the constructor validation of [rulelist.
// NewFilter] and the refresh behavior for file- and HTTP-based lists.
func TestFilter_Refresh(t *testing.T) {
	cacheDir := t.TempDir()
	uid := rulelist.MustNewUID()

	initialFile := filepath.Join(cacheDir, "initial.txt")
	initialData := []byte(
		testRuleTextTitle +
			testRuleTextBlocked,
	)
	writeErr := os.WriteFile(initialFile, initialData, 0o644)
	require.NoError(t, writeErr)

	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
		pt := testutil.PanicT{}

		_, err := io.WriteString(w, testRuleTextTitle+testRuleTextBlocked)
		require.NoError(pt, err)
	}))
	// Close the test server once the test finishes so its listener and
	// goroutines are released.
	t.Cleanup(srv.Close)

	srvURL, urlErr := url.Parse(srv.URL)
	require.NoError(t, urlErr)

	testCases := []struct {
		url           *url.URL
		name          string
		wantNewErrMsg string
	}{{
		url:           nil,
		name:          "nil_url",
		wantNewErrMsg: "no url",
	}, {
		url: &url.URL{
			Scheme: "ftp",
		},
		name:          "bad_scheme",
		wantNewErrMsg: `bad url scheme: "ftp"`,
	}, {
		name: "file",
		url: &url.URL{
			Scheme: "file",
			Path:   initialFile,
		},
		wantNewErrMsg: "",
	}, {
		name:          "http",
		url:           srvURL,
		wantNewErrMsg: "",
	}}

	for _, tc := range testCases {
		t.Run(tc.name, func(t *testing.T) {
			f, err := rulelist.NewFilter(&rulelist.FilterConfig{
				URL:         tc.url,
				Name:        tc.name,
				UID:         uid,
				URLFilterID: testURLFilterID,
				Enabled:     true,
			})
			if tc.wantNewErrMsg != "" {
				assert.EqualError(t, err, tc.wantNewErrMsg)

				return
			}

			// Assert the success path explicitly and validate f before
			// registering a cleanup that calls a method on it.
			require.NoError(t, err)
			require.NotNil(t, f)
			testutil.CleanupAndRequireSuccess(t, f.Close)

			ctx, cancel := context.WithTimeout(context.Background(), testTimeout)
			t.Cleanup(cancel)

			buf := make([]byte, rulelist.DefaultRuleBufSize)
			cli := &http.Client{
				Timeout: testTimeout,
			}

			res, err := f.Refresh(ctx, buf, cli, cacheDir, rulelist.DefaultMaxRuleListSize)
			require.NoError(t, err)

			assert.Equal(t, testTitle, res.Title)
			assert.Equal(t, len(testRuleTextBlocked), res.BytesWritten)
			assert.Equal(t, 1, res.RulesCount)

			// Check that the cached file exists.
			_, err = os.Stat(filepath.Join(cacheDir, uid.String()+".txt"))
			require.NoError(t, err)
		})
	}
}

View file

@ -69,12 +69,12 @@ func TestParser_Parse(t *testing.T) {
wantWritten: len(testRuleTextBlocked) + len(testRuleTextHTML),
}, {
name: "title",
in: "! Title: Test Title \n" +
in: testRuleTextTitle +
"! Title: Bad, Ignored Title\n" +
testRuleTextBlocked,
wantDst: testRuleTextBlocked,
wantErrMsg: "",
wantTitle: "Test Title",
wantTitle: testTitle,
wantRulesNum: 1,
wantWritten: len(testRuleTextBlocked),
}, {
@ -87,14 +87,14 @@ func TestParser_Parse(t *testing.T) {
wantWritten: len(testRuleTextCosmetic),
}, {
name: "bad_char",
in: "! Title: Test Title \n" +
in: testRuleTextTitle +
testRuleTextBlocked +
">>>\x7F<<<",
wantDst: testRuleTextBlocked,
wantErrMsg: "line 3: " +
"character 4: " +
"likely binary character '\\x7f'",
wantTitle: "Test Title",
wantTitle: testTitle,
wantRulesNum: 1,
wantWritten: len(testRuleTextBlocked),
}, {

View file

@ -1,9 +1,55 @@
// Package rulelist contains the implementation of the standard rule-list
// filter that wraps an urlfilter filtering-engine.
//
// TODO(a.garipov): Expand.
// TODO(a.garipov): Add a new update worker.
package rulelist
import (
"fmt"
"github.com/c2h5oh/datasize"
"github.com/google/uuid"
)
// DefaultRuleBufSize is the default length of a buffer used to read a line with
// a filtering rule, in bytes.
//
// TODO(a.garipov): Consider using [datasize.ByteSize].  It is currently only
// used as an int.
const DefaultRuleBufSize = 1024

// DefaultMaxRuleListSize is the default maximum filtering-rule list size.
const DefaultMaxRuleListSize = 64 * datasize.MB

// URLFilterID is a semantic type-alias for IDs used for working with package
// urlfilter.
type URLFilterID = int

// UID is the type for the unique IDs of filtering-rule lists.  It is a UUID
// under the hood; see [NewUID].
type UID uuid.UUID
// NewUID returns a new filtering-rule list UID.  Any error returned is an
// error from the cryptographic randomness reader.
func NewUID() (uid UID, err error) {
	// UUIDv7 values are time-ordered, which keeps generated UIDs sortable.
	u, err := uuid.NewV7()

	return UID(u), err
}
// MustNewUID is a wrapper around [NewUID] that panics if there is an error.
// It is intended for use in initialization code and tests.
func MustNewUID() (uid UID) {
	var err error
	if uid, err = NewUID(); err != nil {
		panic(fmt.Errorf("unexpected uuidv7 error: %w", err))
	}

	return uid
}
// type check that UID implements [fmt.Stringer].
var _ fmt.Stringer = UID{}

// String implements the [fmt.Stringer] interface for UID.  It returns the
// canonical textual form of the underlying UUID.
func (id UID) String() (s string) {
	return uuid.UUID(id).String()
}

View file

@ -1,16 +1,34 @@
package rulelist_test
import "time"
import (
"testing"
"time"
"github.com/AdguardTeam/AdGuardHome/internal/filtering/rulelist"
"github.com/AdguardTeam/golibs/testutil"
)
// TestMain discards log output for the whole package's tests before running
// them.
func TestMain(m *testing.M) {
	testutil.DiscardLogOutput(m)
}
// testTimeout is the common timeout for tests.
const testTimeout = 1 * time.Second
// Common texts for tests.
// testURLFilterID is the common [rulelist.URLFilterID] for tests.
const testURLFilterID rulelist.URLFilterID = 1
// testTitle is the common title for tests.
const testTitle = "Test Title"
// Common rule texts for tests.
const (
testRuleTextBadTab = "||bad-tab-and-comment.example^\t# A comment.\n"
testRuleTextBlocked = "||blocked.example^\n"
testRuleTextBlocked2 = "||blocked-2.example^\n"
testRuleTextEtcHostsTab = "0.0.0.0 tab..example^\t# A comment.\n"
testRuleTextHTML = "<!DOCTYPE html>\n"
testRuleTextTitle = "! Title: " + testTitle + " \n"
// testRuleTextCosmetic is a cosmetic rule with a zero-width non-joiner.
//