mirror of
https://github.com/superseriousbusiness/gotosocial.git
synced 2024-11-24 10:15:47 +03:00
[bugfix] Use better plaintext representation of status for filtering (#3301)
* [bugfix] Use better plaintext representation of status for filtering * add new deps to readme * lint * update tests * update regexes * address review comments * remove now unused xxhash * whoops, wrong logger * Merge branch 'main' into status_filtering_bugfix * put cache in caches struct * pain
This commit is contained in:
parent
6dd936fbe1
commit
efd1a4f717
15 changed files with 2685 additions and 64 deletions
|
@ -273,6 +273,7 @@ The following open source libraries, frameworks, and tools are used by GoToSocia
|
|||
- [jackc/pgconn](https://github.com/jackc/pgconn); Postgres driver. [MIT License](https://spdx.org/licenses/MIT.html).
|
||||
- [jackc/pgx](https://github.com/jackc/pgx); Postgres driver and toolkit. [MIT License](https://spdx.org/licenses/MIT.html).
|
||||
- [KimMachineGun/automemlimit](https://github.com/KimMachineGun/automemlimit); cgroups memory limit checking. [MIT License](https://spdx.org/licenses/MIT.html).
|
||||
- [k3a/html2text](https://github.com/k3a/html2text); HTML-to-text conversion. [MIT License](https://spdx.org/licenses/MIT.html).
|
||||
- [mcuadros/go-syslog](https://github.com/mcuadros/go-syslog); Syslog server library. [MIT License](https://spdx.org/licenses/MIT.html).
|
||||
- [microcosm-cc/bluemonday](https://github.com/microcosm-cc/bluemonday); HTML user-input sanitization. [BSD-3-Clause License](https://spdx.org/licenses/BSD-3-Clause.html).
|
||||
- [miekg/dns](https://github.com/miekg/dns); DNS utilities. [Go License](https://go.dev/LICENSE).
|
||||
|
|
1
go.mod
1
go.mod
|
@ -40,6 +40,7 @@ require (
|
|||
github.com/gorilla/feeds v1.2.0
|
||||
github.com/gorilla/websocket v1.5.2
|
||||
github.com/jackc/pgx/v5 v5.7.1
|
||||
github.com/k3a/html2text v1.2.1
|
||||
github.com/microcosm-cc/bluemonday v1.0.27
|
||||
github.com/miekg/dns v1.1.62
|
||||
github.com/minio/minio-go/v7 v7.0.76
|
||||
|
|
2
go.sum
2
go.sum
|
@ -384,6 +384,8 @@ github.com/jstemmer/go-junit-report v0.9.1/go.mod h1:Brl9GWCQeLvo8nXZwPNNblvFj/X
|
|||
github.com/jtolds/gls v4.20.0+incompatible h1:xdiiI2gbIgH/gLH7ADydsJ1uDOEzR8yvV7C0MuV77Wo=
|
||||
github.com/jtolds/gls v4.20.0+incompatible/go.mod h1:QJZ7F/aHp+rZTRtaJ1ow/lLfFfVYBRgL+9YlvaHOwJU=
|
||||
github.com/k0kubun/colorstring v0.0.0-20150214042306-9440f1994b88/go.mod h1:3w7q1U84EfirKl04SVQ/s7nPm1ZPhiXd34z40TNz36k=
|
||||
github.com/k3a/html2text v1.2.1 h1:nvnKgBvBR/myqrwfLuiqecUtaK1lB9hGziIJKatNFVY=
|
||||
github.com/k3a/html2text v1.2.1/go.mod h1:ieEXykM67iT8lTvEWBh6fhpH4B23kB9OMKPdIBmgUqA=
|
||||
github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck=
|
||||
github.com/klauspost/compress v1.10.4/go.mod h1:aoV0uJVorq1K+umq18yTdKaF57EivdYsUV+/s2qKfXs=
|
||||
github.com/klauspost/compress v1.10.10/go.mod h1:aoV0uJVorq1K+umq18yTdKaF57EivdYsUV+/s2qKfXs=
|
||||
|
|
20
internal/cache/cache.go
vendored
20
internal/cache/cache.go
vendored
|
@ -47,6 +47,11 @@ type Caches struct {
|
|||
// Webfinger provides access to the webfinger URL cache.
|
||||
Webfinger *ttl.Cache[string, string] // TTL=24hr, sweep=5min
|
||||
|
||||
// TTL cache of statuses -> filterable text fields.
|
||||
// To ensure up-to-date fields, cache is keyed as:
|
||||
// `[status.ID][status.UpdatedAt.Unix()]`
|
||||
StatusesFilterableFields *ttl.Cache[string, []string]
|
||||
|
||||
// prevent pass-by-value.
|
||||
_ nocopy
|
||||
}
|
||||
|
@ -109,6 +114,7 @@ func (c *Caches) Init() {
|
|||
c.initUserMuteIDs()
|
||||
c.initWebfinger()
|
||||
c.initVisibility()
|
||||
c.initStatusesFilterableFields()
|
||||
}
|
||||
|
||||
// Start will start any caches that require a background
|
||||
|
@ -119,6 +125,10 @@ func (c *Caches) Start() {
|
|||
tryUntil("starting webfinger cache", 5, func() bool {
|
||||
return c.Webfinger.Start(5 * time.Minute)
|
||||
})
|
||||
|
||||
tryUntil("starting statusesFilterableFields cache", 5, func() bool {
|
||||
return c.StatusesFilterableFields.Start(5 * time.Minute)
|
||||
})
|
||||
}
|
||||
|
||||
// Stop will stop any caches that require a background
|
||||
|
@ -127,6 +137,7 @@ func (c *Caches) Stop() {
|
|||
log.Infof(nil, "stop: %p", c)
|
||||
|
||||
tryUntil("stopping webfinger cache", 5, c.Webfinger.Stop)
|
||||
tryUntil("stopping statusesFilterableFields cache", 5, c.StatusesFilterableFields.Stop)
|
||||
}
|
||||
|
||||
// Sweep will sweep all the available caches to ensure none
|
||||
|
@ -204,3 +215,12 @@ func (c *Caches) initWebfinger() {
|
|||
24*time.Hour,
|
||||
)
|
||||
}
|
||||
|
||||
func (c *Caches) initStatusesFilterableFields() {
|
||||
c.StatusesFilterableFields = new(ttl.Cache[string, []string])
|
||||
c.StatusesFilterableFields.Init(
|
||||
0,
|
||||
512,
|
||||
1*time.Hour,
|
||||
)
|
||||
}
|
||||
|
|
|
@ -20,6 +20,8 @@ package gtsmodel
|
|||
import (
|
||||
"regexp"
|
||||
"time"
|
||||
|
||||
"github.com/superseriousbusiness/gotosocial/internal/util"
|
||||
)
|
||||
|
||||
// Filter stores a filter created by a local account.
|
||||
|
@ -61,14 +63,23 @@ type FilterKeyword struct {
|
|||
|
||||
// Compile will compile this FilterKeyword as a prepared regular expression.
|
||||
func (k *FilterKeyword) Compile() (err error) {
|
||||
var wordBreak string
|
||||
if k.WholeWord != nil && *k.WholeWord {
|
||||
wordBreak = `\b`
|
||||
var (
|
||||
wordBreakStart string
|
||||
wordBreakEnd string
|
||||
)
|
||||
|
||||
if util.PtrOrZero(k.WholeWord) {
|
||||
// Either word boundary or
|
||||
// whitespace or start of line.
|
||||
wordBreakStart = `(?:\b|\s|^)`
|
||||
// Either word boundary or
|
||||
// whitespace or end of line.
|
||||
wordBreakEnd = `(?:\b|\s|$)`
|
||||
}
|
||||
|
||||
// Compile keyword filter regexp.
|
||||
quoted := regexp.QuoteMeta(k.Keyword)
|
||||
k.Regexp, err = regexp.Compile(`(?i)` + wordBreak + quoted + wordBreak)
|
||||
k.Regexp, err = regexp.Compile(`(?i)` + wordBreakStart + quoted + wordBreakEnd)
|
||||
return // caller is expected to wrap this error
|
||||
}
|
||||
|
||||
|
|
|
@ -21,6 +21,8 @@ import (
|
|||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"slices"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
|
@ -35,7 +37,6 @@ import (
|
|||
"github.com/superseriousbusiness/gotosocial/internal/language"
|
||||
"github.com/superseriousbusiness/gotosocial/internal/log"
|
||||
"github.com/superseriousbusiness/gotosocial/internal/media"
|
||||
"github.com/superseriousbusiness/gotosocial/internal/text"
|
||||
"github.com/superseriousbusiness/gotosocial/internal/uris"
|
||||
"github.com/superseriousbusiness/gotosocial/internal/util"
|
||||
)
|
||||
|
@ -939,32 +940,48 @@ func (c *Converter) statusToAPIFilterResults(
|
|||
return nil, nil
|
||||
}
|
||||
|
||||
// Extract text fields from the status that we will match filters against.
|
||||
fields := filterableTextFields(s)
|
||||
// Key this status based on ID + last updated time,
|
||||
// to ensure we always filter on latest version.
|
||||
statusKey := s.ID + strconv.FormatInt(s.UpdatedAt.Unix(), 10)
|
||||
|
||||
// Check if we have filterable fields cached for this status.
|
||||
cache := c.state.Caches.StatusesFilterableFields
|
||||
fields, stored := cache.Get(statusKey)
|
||||
if !stored {
|
||||
// We don't have filterable fields
|
||||
// cached, calculate + cache now.
|
||||
fields = filterableFields(s)
|
||||
cache.Set(statusKey, fields)
|
||||
}
|
||||
|
||||
// Record all matching warn filters and the reasons they matched.
|
||||
filterResults := make([]apimodel.FilterResult, 0, len(filters))
|
||||
for _, filter := range filters {
|
||||
if !filterAppliesInContext(filter, filterContext) {
|
||||
// Filter doesn't apply to this context.
|
||||
continue
|
||||
}
|
||||
if filter.Expired(now) {
|
||||
// Filter doesn't apply
|
||||
// to this context.
|
||||
continue
|
||||
}
|
||||
|
||||
// List all matching keywords.
|
||||
if filter.Expired(now) {
|
||||
// Filter doesn't
|
||||
// apply anymore.
|
||||
continue
|
||||
}
|
||||
|
||||
// Assemble matching keywords (if any) from this filter.
|
||||
keywordMatches := make([]string, 0, len(filter.Keywords))
|
||||
for _, filterKeyword := range filter.Keywords {
|
||||
var isMatch bool
|
||||
for _, field := range fields {
|
||||
if filterKeyword.Regexp.MatchString(field) {
|
||||
isMatch = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if isMatch {
|
||||
keywordMatches = append(keywordMatches, filterKeyword.Keyword)
|
||||
for _, keyword := range filter.Keywords {
|
||||
// Check if at least one filterable field
|
||||
// in the status matches on this filter.
|
||||
if slices.ContainsFunc(
|
||||
fields,
|
||||
func(field string) bool {
|
||||
return keyword.Regexp.MatchString(field)
|
||||
},
|
||||
) {
|
||||
// At least one field matched on this filter.
|
||||
keywordMatches = append(keywordMatches, keyword.Keyword)
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1001,40 +1018,6 @@ func (c *Converter) statusToAPIFilterResults(
|
|||
return filterResults, nil
|
||||
}
|
||||
|
||||
// filterableTextFields returns all text from a status that we might want to filter on:
|
||||
// - content
|
||||
// - content warning
|
||||
// - media descriptions
|
||||
// - poll options
|
||||
func filterableTextFields(s *gtsmodel.Status) []string {
|
||||
fieldCount := 2 + len(s.Attachments)
|
||||
if s.Poll != nil {
|
||||
fieldCount += len(s.Poll.Options)
|
||||
}
|
||||
fields := make([]string, 0, fieldCount)
|
||||
|
||||
if s.Content != "" {
|
||||
fields = append(fields, text.SanitizeToPlaintext(s.Content))
|
||||
}
|
||||
if s.ContentWarning != "" {
|
||||
fields = append(fields, s.ContentWarning)
|
||||
}
|
||||
for _, attachment := range s.Attachments {
|
||||
if attachment.Description != "" {
|
||||
fields = append(fields, attachment.Description)
|
||||
}
|
||||
}
|
||||
if s.Poll != nil {
|
||||
for _, option := range s.Poll.Options {
|
||||
if option != "" {
|
||||
fields = append(fields, option)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return fields
|
||||
}
|
||||
|
||||
// filterAppliesInContext returns whether a given filter applies in a given context.
|
||||
func filterAppliesInContext(filter *gtsmodel.Filter, filterContext statusfilter.FilterContext) bool {
|
||||
switch filterContext {
|
||||
|
|
|
@ -1063,15 +1063,21 @@ func (suite *InternalToFrontendTestSuite) TestHideFilteredBoostToFrontend() {
|
|||
|
||||
// Test that a hashtag filter for a hashtag in Mastodon HTML content works the way most users would expect.
|
||||
func (suite *InternalToFrontendTestSuite) testHashtagFilteredStatusToFrontend(wholeWord bool, boost bool) {
|
||||
testStatus := suite.testStatuses["admin_account_status_1"]
|
||||
testStatus := new(gtsmodel.Status)
|
||||
*testStatus = *suite.testStatuses["admin_account_status_1"]
|
||||
testStatus.Content = `<p>doggo doggin' it</p><p><a href="https://example.test/tags/dogsofmastodon" class="mention hashtag" rel="tag nofollow noreferrer noopener" target="_blank">#<span>dogsofmastodon</span></a></p>`
|
||||
|
||||
if boost {
|
||||
// Modify a fixture boost into a boost of the above status.
|
||||
boostStatus := suite.testStatuses["admin_account_status_4"]
|
||||
boostStatus.BoostOf = testStatus
|
||||
boostStatus.BoostOfID = testStatus.ID
|
||||
testStatus = boostStatus
|
||||
boost, err := suite.typeconverter.StatusToBoost(
|
||||
context.Background(),
|
||||
testStatus,
|
||||
suite.testAccounts["admin_account"],
|
||||
"",
|
||||
)
|
||||
if err != nil {
|
||||
suite.FailNow(err.Error())
|
||||
}
|
||||
testStatus = boost
|
||||
}
|
||||
|
||||
requestingAccount := suite.testAccounts["local_account_1"]
|
||||
|
@ -1103,9 +1109,11 @@ func (suite *InternalToFrontendTestSuite) testHashtagFilteredStatusToFrontend(wh
|
|||
[]*gtsmodel.Filter{filter},
|
||||
nil,
|
||||
)
|
||||
if suite.NoError(err) {
|
||||
suite.NotEmpty(apiStatus.Filtered)
|
||||
if err != nil {
|
||||
suite.FailNow(err.Error())
|
||||
}
|
||||
|
||||
suite.NotEmpty(apiStatus.Filtered)
|
||||
}
|
||||
|
||||
func (suite *InternalToFrontendTestSuite) TestHashtagWholeWordFilteredStatusToFrontend() {
|
||||
|
|
|
@ -27,6 +27,7 @@ import (
|
|||
"strconv"
|
||||
"strings"
|
||||
|
||||
"github.com/k3a/html2text"
|
||||
apimodel "github.com/superseriousbusiness/gotosocial/internal/api/model"
|
||||
"github.com/superseriousbusiness/gotosocial/internal/config"
|
||||
"github.com/superseriousbusiness/gotosocial/internal/gtsmodel"
|
||||
|
@ -284,3 +285,64 @@ func ContentToContentLanguage(
|
|||
|
||||
return contentStr, langTagStr
|
||||
}
|
||||
|
||||
// filterableFields returns text fields from
|
||||
// a status that we might want to filter on:
|
||||
//
|
||||
// - content warning
|
||||
// - content (converted to plaintext from HTML)
|
||||
// - media descriptions
|
||||
// - poll options
|
||||
//
|
||||
// Each field should be filtered separately.
|
||||
// This avoids scenarios where false-positive
|
||||
// multiple-word matches can be made by matching
|
||||
// the last word of one field + the first word
|
||||
// of the next field together.
|
||||
func filterableFields(s *gtsmodel.Status) []string {
|
||||
// Estimate length of fields.
|
||||
fieldCount := 2 + len(s.Attachments)
|
||||
if s.Poll != nil {
|
||||
fieldCount += len(s.Poll.Options)
|
||||
}
|
||||
fields := make([]string, 0, fieldCount)
|
||||
|
||||
// Content warning / title.
|
||||
if s.ContentWarning != "" {
|
||||
fields = append(fields, s.ContentWarning)
|
||||
}
|
||||
|
||||
// Status content. Though we have raw text
|
||||
// available for statuses created on our
|
||||
// instance, use the html2text version to
|
||||
// remove markdown-formatting characters
|
||||
// and ensure more consistent filtering.
|
||||
if s.Content != "" {
|
||||
text := html2text.HTML2TextWithOptions(
|
||||
s.Content,
|
||||
html2text.WithLinksInnerText(),
|
||||
html2text.WithUnixLineBreaks(),
|
||||
)
|
||||
if text != "" {
|
||||
fields = append(fields, text)
|
||||
}
|
||||
}
|
||||
|
||||
// Media descriptions.
|
||||
for _, attachment := range s.Attachments {
|
||||
if attachment.Description != "" {
|
||||
fields = append(fields, attachment.Description)
|
||||
}
|
||||
}
|
||||
|
||||
// Poll options.
|
||||
if s.Poll != nil {
|
||||
for _, opt := range s.Poll.Options {
|
||||
if opt != "" {
|
||||
fields = append(fields, opt)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return fields
|
||||
}
|
||||
|
|
|
@ -21,6 +21,7 @@ import (
|
|||
"context"
|
||||
"testing"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/superseriousbusiness/gotosocial/internal/config"
|
||||
"github.com/superseriousbusiness/gotosocial/internal/gtsmodel"
|
||||
"github.com/superseriousbusiness/gotosocial/internal/language"
|
||||
|
@ -158,3 +159,62 @@ func TestContentToContentLanguage(t *testing.T) {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestFilterableText(t *testing.T) {
|
||||
type testcase struct {
|
||||
status *gtsmodel.Status
|
||||
expectedFields []string
|
||||
}
|
||||
|
||||
for _, testcase := range []testcase{
|
||||
{
|
||||
status: &gtsmodel.Status{
|
||||
ContentWarning: "This is a test status",
|
||||
Content: `<p>Import / export of account data via CSV files will be coming in 0.17.0 :) No more having to run scripts + CLI tools to import a list of accounts you follow, after doing a migration to a <a href="https://gts.superseriousbusiness.org/tags/gotosocial" class="mention hashtag" rel="tag nofollow noreferrer noopener" target="_blank">#<span>GoToSocial</span></a> instance.</p>`,
|
||||
},
|
||||
expectedFields: []string{
|
||||
"This is a test status",
|
||||
"Import / export of account data via CSV files will be coming in 0.17.0 :) No more having to run scripts + CLI tools to import a list of accounts you follow, after doing a migration to a #GoToSocial <https://gts.superseriousbusiness.org/tags/gotosocial> instance.",
|
||||
},
|
||||
},
|
||||
{
|
||||
status: &gtsmodel.Status{
|
||||
Content: `<p><span class="h-card"><a href="https://example.org/@zlatko" class="u-url mention" rel="nofollow noreferrer noopener" target="_blank">@<span>zlatko</span></a></span> currently we used modernc/sqlite3 for our sqlite driver, but we've been experimenting with wasm sqlite, and will likely move to that permanently in future; in the meantime, both options are available (the latter with a build tag)</p><p><a href="https://github.com/superseriousbusiness/gotosocial/pull/2863" rel="nofollow noreferrer noopener" target="_blank">https://github.com/superseriousbusiness/gotosocial/pull/2863</a></p>`,
|
||||
},
|
||||
expectedFields: []string{
|
||||
"@zlatko <https://example.org/@zlatko> currently we used modernc/sqlite3 for our sqlite driver, but we've been experimenting with wasm sqlite, and will likely move to that permanently in future; in the meantime, both options are available (the latter with a build tag)\n\nhttps://github.com/superseriousbusiness/gotosocial/pull/2863 <https://github.com/superseriousbusiness/gotosocial/pull/2863>",
|
||||
},
|
||||
},
|
||||
{
|
||||
status: &gtsmodel.Status{
|
||||
ContentWarning: "Nerd stuff",
|
||||
Content: `<p>Latest graphs for <a href="https://gts.superseriousbusiness.org/tags/gotosocial" class="mention hashtag" rel="tag nofollow noreferrer noopener" target="_blank">#<span>GoToSocial</span></a> on <a href="https://github.com/ncruces/go-sqlite3" rel="nofollow noreferrer noopener" target="_blank">Wasm sqlite3</a> with <a href="https://codeberg.org/gruf/go-ffmpreg" rel="nofollow noreferrer noopener" target="_blank">embedded Wasm ffmpeg</a>, both running on <a href="https://wazero.io/" rel="nofollow noreferrer noopener" target="_blank">Wazero</a>, and configured with a <a href="https://github.com/superseriousbusiness/gotosocial/blob/20fe430ef9ff3012a7a4dc2d01b68020c20e13bb/example/config.yaml#L259-L266" rel="nofollow noreferrer noopener" target="_blank">50MiB db cache target</a>. This is the version we'll be releasing soonish, now we're happy with how we've tamed everything.</p>`,
|
||||
Attachments: []*gtsmodel.MediaAttachment{
|
||||
{
|
||||
Description: `Graph showing GtS using between 150-300 MiB of memory, steadily, over a few days.`,
|
||||
},
|
||||
{
|
||||
Description: `Another media attachment`,
|
||||
},
|
||||
},
|
||||
Poll: &gtsmodel.Poll{
|
||||
Options: []string{
|
||||
"Poll option 1",
|
||||
"Poll option 2",
|
||||
},
|
||||
},
|
||||
},
|
||||
expectedFields: []string{
|
||||
"Nerd stuff",
|
||||
"Latest graphs for #GoToSocial <https://gts.superseriousbusiness.org/tags/gotosocial> on Wasm sqlite3 <https://github.com/ncruces/go-sqlite3> with embedded Wasm ffmpeg <https://codeberg.org/gruf/go-ffmpreg>, both running on Wazero <https://wazero.io/>, and configured with a 50MiB db cache target <https://github.com/superseriousbusiness/gotosocial/blob/20fe430ef9ff3012a7a4dc2d01b68020c20e13bb/example/config.yaml#L259-L266>. This is the version we'll be releasing soonish, now we're happy with how we've tamed everything.",
|
||||
"Graph showing GtS using between 150-300 MiB of memory, steadily, over a few days.",
|
||||
"Another media attachment",
|
||||
"Poll option 1",
|
||||
"Poll option 2",
|
||||
},
|
||||
},
|
||||
} {
|
||||
fields := filterableFields(testcase.status)
|
||||
assert.Equal(t, testcase.expectedFields, fields)
|
||||
}
|
||||
}
|
||||
|
|
10
vendor/github.com/k3a/html2text/.travis.yml
generated
vendored
Normal file
10
vendor/github.com/k3a/html2text/.travis.yml
generated
vendored
Normal file
|
@ -0,0 +1,10 @@
|
|||
language: go
|
||||
go:
|
||||
- master
|
||||
before_install:
|
||||
- go get github.com/axw/gocov/gocov
|
||||
- go get github.com/mattn/goveralls
|
||||
- if ! go get github.com/golang/tools/cmd/cover; then go get golang.org/x/tools/cmd/cover; fi
|
||||
script:
|
||||
- $HOME/gopath/bin/goveralls -service=travis-ci
|
||||
|
21
vendor/github.com/k3a/html2text/LICENSE
generated
vendored
Normal file
21
vendor/github.com/k3a/html2text/LICENSE
generated
vendored
Normal file
|
@ -0,0 +1,21 @@
|
|||
MIT License
|
||||
|
||||
Copyright (c) 2017 Mario K3A Hros (www.k3a.me)
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
60
vendor/github.com/k3a/html2text/README.md
generated
vendored
Normal file
60
vendor/github.com/k3a/html2text/README.md
generated
vendored
Normal file
|
@ -0,0 +1,60 @@
|
|||
[![GoDoc](https://godoc.org/github.com/k3a/html2text?status.svg)](https://godoc.org/github.com/k3a/html2text)
|
||||
[![Build Status](https://travis-ci.org/k3a/html2text.svg?branch=master)](https://travis-ci.org/k3a/html2text)
|
||||
[![Coverage Status](https://coveralls.io/repos/github/k3a/html2text/badge.svg?branch=master)](https://coveralls.io/github/k3a/html2text?branch=master)
|
||||
[![Report Card](https://goreportcard.com/badge/github.com/k3a/html2text)](https://goreportcard.com/report/github.com/k3a/html2text)
|
||||
|
||||
# html2text
|
||||
|
||||
A simple Golang package to convert HTML to plain text (without non-standard dependencies).
|
||||
|
||||
It converts HTML tags to text and also parses HTML entities into characters they represent.
|
||||
A `<head>` section of the HTML document, as well as most other tags are stripped out but
|
||||
links are properly converted into their href attribute.
|
||||
|
||||
It can be used for converting HTML emails into text.
|
||||
|
||||
Some tests are installed as well.
|
||||
Uses semantic versioning and no breaking changes are planned.
|
||||
|
||||
Feel free to publish a pull request if you have suggestions for improvement, but please note that the library can now be considered feature-complete and API stable. If you need more than this basic conversion, please use an alternative mentioned at the bottom.
|
||||
|
||||
## Install
|
||||
```bash
|
||||
go get github.com/k3a/html2text
|
||||
```
|
||||
|
||||
## Usage
|
||||
|
||||
```go
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"github.com/k3a/html2text"
|
||||
)
|
||||
|
||||
func main() {
|
||||
html := `<html><head><title>Good</title></head><body><strong>clean</strong> text</body>`
|
||||
|
||||
plain := html2text.HTML2Text(html)
|
||||
|
||||
fmt.Println(plain)
|
||||
}
|
||||
|
||||
/* Outputs:
|
||||
|
||||
clean text
|
||||
*/
|
||||
|
||||
```
|
||||
|
||||
To see all features, please look into `html2text_test.go`.
|
||||
|
||||
## Alternatives
|
||||
- https://github.com/jaytaylor/html2text (heavier, with more features)
|
||||
- https://git.alexwennerberg.com/nanohtml2text (rewrite of this module in Rust)
|
||||
|
||||
## License
|
||||
|
||||
MIT
|
||||
|
2046
vendor/github.com/k3a/html2text/entity.go
generated
vendored
Normal file
2046
vendor/github.com/k3a/html2text/entity.go
generated
vendored
Normal file
File diff suppressed because it is too large
Load diff
333
vendor/github.com/k3a/html2text/html2text.go
generated
vendored
Normal file
333
vendor/github.com/k3a/html2text/html2text.go
generated
vendored
Normal file
|
@ -0,0 +1,333 @@
|
|||
package html2text
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"regexp"
|
||||
"strconv"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// Line break constants.
//
// Deprecated: Please use HTML2TextWithOptions(text, WithUnixLineBreaks())
const (
	WIN_LBR  = "\r\n"
	UNIX_LBR = "\n"
)

var (
	// legacyLBR is the line break used by the legacy HTML2Text
	// entry point; SetUnixLbr switches it.
	legacyLBR = WIN_LBR

	// badTagnamesRE matches (lowercased) opening tags whose
	// content must not be emitted as text.
	badTagnamesRE = regexp.MustCompile(`^(head|script|style|a)($|\s+)`)

	// linkTagRE matches the inside of an <a ...> tag carrying an href.
	// Submatches 2, 3 and 4 hold the single-quoted, double-quoted and
	// unquoted href value respectively.
	linkTagRE = regexp.MustCompile(`^(?i:a)(?:$|\s).*(?i:href)\s*=\s*('([^']*?)'|"([^"]*?)"|([^\s"'` + "`" + `=<>]+))`)

	// badLinkHrefRE matches href values that must never be emitted.
	badLinkHrefRE = regexp.MustCompile(`javascript:`)

	// headersRE matches (lowercased) opening and closing h1..h6 tags.
	headersRE = regexp.MustCompile(`^(\/)?h[1-6]`)

	// numericEntityRE matches the name of a numeric character reference,
	// e.g. "#39" or "#x27"; submatch 1 is the number including any "x".
	numericEntityRE = regexp.MustCompile(`(?i)^#(x?[a-f0-9]+)$`)
)

// options holds the settings of a single conversion.
type options struct {
	lbr            string // line break sequence written to the output
	linksInnerText bool   // keep link text and append "<href>" after it
	listPrefix     string // prefix written before each list item
}

// newOptions returns the default settings: Windows line
// breaks, links rendered as their href, no list prefix.
func newOptions() *options {
	defaults := options{lbr: WIN_LBR}
	return &defaults
}

// Option is a functional option
type Option func(*options)

// WithUnixLineBreaks instructs the converter to use unix line breaks ("\n" instead of "\r\n" default)
func WithUnixLineBreaks() Option {
	return func(target *options) { target.lbr = UNIX_LBR }
}

// WithLinksInnerText instructs the converter to retain link tag inner text and append href URLs in angle brackets after the text
// Example: click news <http://bit.ly/2n4wXRs>
func WithLinksInnerText() Option {
	return func(target *options) { target.linksInnerText = true }
}

// WithListSupportPrefix formats <ul> and <li> lists with the specified prefix
func WithListSupportPrefix(prefix string) Option {
	return func(target *options) { target.listPrefix = prefix }
}

// WithListSupport formats <ul> and <li> lists with " - " prefix
func WithListSupport() Option {
	const defaultPrefix = " - "
	return WithListSupportPrefix(defaultPrefix)
}
|
||||
|
||||
func parseHTMLEntity(entName string) (string, bool) {
|
||||
if r, ok := entity[entName]; ok {
|
||||
return string(r), true
|
||||
}
|
||||
|
||||
if match := numericEntityRE.FindStringSubmatch(entName); len(match) == 2 {
|
||||
var (
|
||||
err error
|
||||
n int64
|
||||
digits = match[1]
|
||||
)
|
||||
|
||||
if digits != "" && (digits[0] == 'x' || digits[0] == 'X') {
|
||||
n, err = strconv.ParseInt(digits[1:], 16, 64)
|
||||
} else {
|
||||
n, err = strconv.ParseInt(digits, 10, 64)
|
||||
}
|
||||
|
||||
if err == nil && (n == 9 || n == 10 || n == 13 || n > 31) {
|
||||
return string(rune(n)), true
|
||||
}
|
||||
}
|
||||
|
||||
return "", false
|
||||
}
|
||||
|
||||
// SetUnixLbr with argument true sets Unix-style line-breaks in output ("\n")
|
||||
// with argument false sets Windows-style line-breaks in output ("\r\n", the default)
|
||||
// Deprecated: Please use HTML2TextWithOptions(text, WithUnixLineBreak())
|
||||
func SetUnixLbr(b bool) {
|
||||
if b {
|
||||
legacyLBR = UNIX_LBR
|
||||
} else {
|
||||
legacyLBR = WIN_LBR
|
||||
}
|
||||
}
|
||||
|
||||
// HTMLEntitiesToText decodes HTML entities inside a provided
|
||||
// string and returns decoded text
|
||||
func HTMLEntitiesToText(htmlEntsText string) string {
|
||||
outBuf := bytes.NewBufferString("")
|
||||
inEnt := false
|
||||
|
||||
for i, r := range htmlEntsText {
|
||||
switch {
|
||||
case r == ';' && inEnt:
|
||||
inEnt = false
|
||||
continue
|
||||
|
||||
case r == '&': //possible html entity
|
||||
entName := ""
|
||||
isEnt := false
|
||||
|
||||
// parse the entity name - max 10 chars
|
||||
chars := 0
|
||||
for _, er := range htmlEntsText[i+1:] {
|
||||
if er == ';' {
|
||||
isEnt = true
|
||||
break
|
||||
} else {
|
||||
entName += string(er)
|
||||
}
|
||||
|
||||
chars++
|
||||
if chars == 10 {
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
if isEnt {
|
||||
if ent, isEnt := parseHTMLEntity(entName); isEnt {
|
||||
outBuf.WriteString(ent)
|
||||
inEnt = true
|
||||
continue
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if !inEnt {
|
||||
outBuf.WriteRune(r)
|
||||
}
|
||||
}
|
||||
|
||||
return outBuf.String()
|
||||
}
|
||||
|
||||
// writeSpace appends a single space to outBuf, unless the
// buffer is empty or its last byte is already a space.
func writeSpace(outBuf *bytes.Buffer) {
	written := outBuf.Bytes()
	if n := len(written); n == 0 || written[n-1] == ' ' {
		return
	}
	outBuf.WriteByte(' ')
}
|
||||
|
||||
// HTML2Text converts html into a text form
|
||||
func HTML2Text(html string) string {
|
||||
var opts []Option
|
||||
if legacyLBR == UNIX_LBR {
|
||||
opts = append(opts, WithUnixLineBreaks())
|
||||
}
|
||||
return HTML2TextWithOptions(html, opts...)
|
||||
}
|
||||
|
||||
// HTML2TextWithOptions converts html into a text form with additional options
|
||||
func HTML2TextWithOptions(html string, reqOpts ...Option) string {
|
||||
opts := newOptions()
|
||||
for _, opt := range reqOpts {
|
||||
opt(opts)
|
||||
}
|
||||
|
||||
inLen := len(html)
|
||||
tagStart := 0
|
||||
inEnt := false
|
||||
badTagStackDepth := 0 // if == 1 it means we are inside <head>...</head>
|
||||
shouldOutput := true
|
||||
// maintain a stack of <a> tag href links and output it after the tag's inner text (for opts.linksInnerText only)
|
||||
hrefs := []string{}
|
||||
// new line cannot be printed at the beginning or
|
||||
// for <p> after a new line created by previous <p></p>
|
||||
canPrintNewline := false
|
||||
|
||||
outBuf := bytes.NewBufferString("")
|
||||
|
||||
for i, r := range html {
|
||||
if inLen > 0 && i == inLen-1 {
|
||||
// prevent new line at the end of the document
|
||||
canPrintNewline = false
|
||||
}
|
||||
|
||||
switch {
|
||||
// skip new lines and spaces adding a single space if not there yet
|
||||
case r <= 0xD, r == 0x85, r == 0x2028, r == 0x2029, // new lines
|
||||
r == ' ', r >= 0x2008 && r <= 0x200B: // spaces
|
||||
if shouldOutput && badTagStackDepth == 0 && !inEnt {
|
||||
//outBuf.WriteString(fmt.Sprintf("{DBG r:%c, inEnt:%t, tag:%s}", r, inEnt, html[tagStart:i]))
|
||||
writeSpace(outBuf)
|
||||
}
|
||||
continue
|
||||
|
||||
case r == ';' && inEnt: // end of html entity
|
||||
inEnt = false
|
||||
continue
|
||||
|
||||
case r == '&' && shouldOutput: // possible html entity
|
||||
entName := ""
|
||||
isEnt := false
|
||||
|
||||
// parse the entity name - max 10 chars
|
||||
chars := 0
|
||||
for _, er := range html[i+1:] {
|
||||
if er == ';' {
|
||||
isEnt = true
|
||||
break
|
||||
} else {
|
||||
entName += string(er)
|
||||
}
|
||||
|
||||
chars++
|
||||
if chars == 10 {
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
if isEnt {
|
||||
if ent, isEnt := parseHTMLEntity(entName); isEnt {
|
||||
outBuf.WriteString(ent)
|
||||
inEnt = true
|
||||
continue
|
||||
}
|
||||
}
|
||||
|
||||
case r == '<': // start of a tag
|
||||
tagStart = i + 1
|
||||
shouldOutput = false
|
||||
continue
|
||||
|
||||
case r == '>': // end of a tag
|
||||
shouldOutput = true
|
||||
tag := html[tagStart:i]
|
||||
tagNameLowercase := strings.ToLower(tag)
|
||||
|
||||
if tagNameLowercase == "/ul" || tagNameLowercase == "/ol" {
|
||||
outBuf.WriteString(opts.lbr)
|
||||
} else if tagNameLowercase == "li" || tagNameLowercase == "li/" {
|
||||
if opts.listPrefix != "" {
|
||||
outBuf.WriteString(opts.lbr + opts.listPrefix)
|
||||
} else {
|
||||
outBuf.WriteString(opts.lbr)
|
||||
}
|
||||
} else if headersRE.MatchString(tagNameLowercase) {
|
||||
if canPrintNewline {
|
||||
outBuf.WriteString(opts.lbr + opts.lbr)
|
||||
}
|
||||
canPrintNewline = false
|
||||
} else if tagNameLowercase == "br" || tagNameLowercase == "br/" {
|
||||
// new line
|
||||
outBuf.WriteString(opts.lbr)
|
||||
} else if tagNameLowercase == "p" || tagNameLowercase == "/p" {
|
||||
if canPrintNewline {
|
||||
outBuf.WriteString(opts.lbr + opts.lbr)
|
||||
}
|
||||
canPrintNewline = false
|
||||
} else if opts.linksInnerText && tagNameLowercase == "/a" {
|
||||
// end of link
|
||||
// links can be empty can happen if the link matches the badLinkHrefRE
|
||||
if len(hrefs) > 0 {
|
||||
outBuf.WriteString(" <")
|
||||
outBuf.WriteString(HTMLEntitiesToText(hrefs[0]))
|
||||
outBuf.WriteString(">")
|
||||
hrefs = hrefs[1:]
|
||||
}
|
||||
} else if opts.linksInnerText && linkTagRE.MatchString(tagNameLowercase) {
|
||||
// parse link href
|
||||
// add special handling for a tags
|
||||
m := linkTagRE.FindStringSubmatch(tag)
|
||||
if len(m) == 5 {
|
||||
link := m[2]
|
||||
if len(link) == 0 {
|
||||
link = m[3]
|
||||
if len(link) == 0 {
|
||||
link = m[4]
|
||||
}
|
||||
}
|
||||
|
||||
if opts.linksInnerText && !badLinkHrefRE.MatchString(link) {
|
||||
hrefs = append(hrefs, link)
|
||||
}
|
||||
}
|
||||
} else if badTagnamesRE.MatchString(tagNameLowercase) {
|
||||
// unwanted block
|
||||
badTagStackDepth++
|
||||
|
||||
// if link inner text preservation is not enabled
|
||||
// and the current tag is a link tag, parse its href and output that
|
||||
if !opts.linksInnerText {
|
||||
// parse link href
|
||||
m := linkTagRE.FindStringSubmatch(tag)
|
||||
if len(m) == 5 {
|
||||
link := m[2]
|
||||
if len(link) == 0 {
|
||||
link = m[3]
|
||||
if len(link) == 0 {
|
||||
link = m[4]
|
||||
}
|
||||
}
|
||||
|
||||
if !badLinkHrefRE.MatchString(link) {
|
||||
outBuf.WriteString(HTMLEntitiesToText(link))
|
||||
}
|
||||
}
|
||||
}
|
||||
} else if len(tagNameLowercase) > 0 && tagNameLowercase[0] == '/' &&
|
||||
badTagnamesRE.MatchString(tagNameLowercase[1:]) {
|
||||
// end of unwanted block
|
||||
badTagStackDepth--
|
||||
}
|
||||
continue
|
||||
|
||||
} // switch end
|
||||
|
||||
if shouldOutput && badTagStackDepth == 0 && !inEnt {
|
||||
canPrintNewline = true
|
||||
outBuf.WriteRune(r)
|
||||
}
|
||||
}
|
||||
|
||||
return outBuf.String()
|
||||
}
|
3
vendor/modules.txt
vendored
3
vendor/modules.txt
vendored
|
@ -446,6 +446,9 @@ github.com/josharian/intern
|
|||
# github.com/json-iterator/go v1.1.12
|
||||
## explicit; go 1.12
|
||||
github.com/json-iterator/go
|
||||
# github.com/k3a/html2text v1.2.1
|
||||
## explicit; go 1.16
|
||||
github.com/k3a/html2text
|
||||
# github.com/klauspost/compress v1.17.9
|
||||
## explicit; go 1.20
|
||||
github.com/klauspost/compress
|
||||
|
|
Loading…
Reference in a new issue