From 1e57cff3e07b854a085f2a10fa2f972a8f960bfb Mon Sep 17 00:00:00 2001 From: Gabe Kangas Date: Wed, 16 Aug 2023 18:19:09 -0700 Subject: [PATCH] fix: use lightweight bot/scraper html responses. Fixes #3253 --- controllers/index.go | 88 +++++++++++++++++++ go.mod | 1 + go.sum | 2 + static/metadata.html.tmpl | 83 +++++++++++++++++ static/static.go | 11 +++ .../browser/bot-share-search-scrapers.test.js | 48 ++++++++++ utils/utils.go | 29 ++++++ 7 files changed, 262 insertions(+) create mode 100644 static/metadata.html.tmpl create mode 100644 test/automated/browser/bot-share-search-scrapers.test.js diff --git a/controllers/index.go b/controllers/index.go index 18f44674d..9ae240abd 100644 --- a/controllers/index.go +++ b/controllers/index.go @@ -4,13 +4,18 @@ import ( "encoding/json" "fmt" "net/http" + "net/url" "path/filepath" "strings" + "github.com/owncast/owncast/config" + "github.com/owncast/owncast/core" "github.com/owncast/owncast/core/data" + "github.com/owncast/owncast/models" "github.com/owncast/owncast/router/middleware" "github.com/owncast/owncast/static" "github.com/owncast/owncast/utils" + log "github.com/sirupsen/logrus" ) // IndexHandler handles the default index route. @@ -24,6 +29,13 @@ func IndexHandler(w http.ResponseWriter, r *http.Request) { return } + // For search engine bots and social scrapers return a special + // server-rendered page. + if utils.IsUserAgentABot(r.UserAgent()) && isIndexRequest { + handleScraperMetadataPage(w, r) + return + } + // Set a cache control max-age header middleware.SetCachingHeaders(w, r) @@ -93,3 +105,79 @@ func renderIndexHtml(w http.ResponseWriter, nonce string) { http.Error(w, err.Error(), http.StatusInternalServerError) } } + +// MetadataPage represents a server-rendered web page for bots and web scrapers. 
+type MetadataPage struct {
+	RequestedURL  string // absolute URL of the request (scheme, host and path)
+	Image         string // absolute URL of the server logo (/logo/external)
+	Thumbnail     string // stream thumbnail URL while live, otherwise the logo URL
+	TagsString    string // Tags joined with commas
+	Summary       string
+	Name          string
+	Tags          []string
+	SocialHandles []models.SocialHandle
+}
+
+// handleScraperMetadataPage returns a basic HTML page with server-rendered
+// metadata from the config for Opengraph clients and web scrapers (bots, web
+// crawlers, etc). It answers 500 if the template fails and 400 on a bad host.
+func handleScraperMetadataPage(w http.ResponseWriter, r *http.Request) {
+	tmpl, err := static.GetBotMetadataTemplate()
+	if err != nil {
+		log.Errorln(err)
+		w.WriteHeader(http.StatusInternalServerError)
+		return
+	}
+
+	scheme := "http"
+	if siteURL := data.GetServerURL(); siteURL != "" {
+		if parsed, err := url.Parse(siteURL); err == nil && parsed.Scheme != "" {
+			scheme = parsed.Scheme
+		}
+	}
+
+	// r.Host is client-supplied, so parsing can fail. Bail out instead of
+	// carrying on with a nil *url.URL, whose String() call would panic.
+	fullURL, err := url.Parse(fmt.Sprintf("%s://%s%s", scheme, r.Host, r.URL.Path))
+	if err != nil {
+		log.Errorln(err)
+		w.WriteHeader(http.StatusBadRequest)
+		return
+	}
+	imageURL, err := url.Parse(fmt.Sprintf("%s://%s%s", scheme, r.Host, "/logo/external"))
+	if err != nil {
+		log.Errorln(err)
+		w.WriteHeader(http.StatusBadRequest)
+		return
+	}
+
+	// If the thumbnail does not exist or we're offline then just use the logo image.
+	thumbnailURL := imageURL.String()
+	if core.GetStatus().Online && utils.DoesFileExists(filepath.Join(config.DataDirectory, "tmp", "thumbnail.jpg")) {
+		if thumbnail, err := url.Parse(fmt.Sprintf("%s://%s%s", scheme, r.Host, "/thumbnail.jpg")); err != nil {
+			log.Errorln(err)
+		} else {
+			thumbnailURL = thumbnail.String()
+		}
+	}
+
+	tags := data.GetServerMetadataTags() // fetched once, used for both tag fields
+	metadata := MetadataPage{
+		Name:          data.GetServerName(),
+		RequestedURL:  fullURL.String(),
+		Image:         imageURL.String(),
+		Summary:       data.GetServerSummary(),
+		Thumbnail:     thumbnailURL,
+		TagsString:    strings.Join(tags, ","),
+		Tags:          tags,
+		SocialHandles: data.GetSocialHandles(),
+	}
+
+	// Set a cache header
+	middleware.SetCachingHeaders(w, r)
+
+	w.Header().Set("Content-Type", "text/html")
+	if err := tmpl.Execute(w, metadata); err != nil {
+		log.Errorln(err)
+	}
+}
diff --git a/go.mod b/go.mod
index 683223509..038588fe3 100644
--- a/go.mod
+++ b/go.mod
@@ -67,6 +67,7 @@ require (
 	github.com/golang-jwt/jwt v3.2.2+incompatible // indirect
 	github.com/gorilla/css v1.0.0 // indirect
 	github.com/jmespath/go-jmespath v0.4.0 // indirect
+	github.com/mssola/user_agent v0.6.0 // indirect
 	github.com/oschwald/maxminddb-golang v1.11.0 // indirect
 	github.com/shoenig/go-m1cpu v0.1.6 // indirect
 )
diff --git a/go.sum b/go.sum
index acb7bcaa7..b4b2720cf 100644
--- a/go.sum
+++ b/go.sum
@@ -85,6 +85,8 @@ github.com/microcosm-cc/bluemonday v1.0.25 h1:4NEwSfiJ+Wva0VxN5B8OwMicaJvD8r9tlJ
 github.com/microcosm-cc/bluemonday v1.0.25/go.mod h1:ZIOjCQp1OrzBBPIJmfX4qDYFuhU02nx4bn030ixfHLE=
 github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
 github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk=
+github.com/mssola/user_agent v0.6.0 h1:uwPR4rtWlCHRFyyP9u2KOV0u8iQXmS7Z7feTrstQwk4=
+github.com/mssola/user_agent v0.6.0/go.mod h1:TTPno8LPY3wAIEKRpAtkdMT0f8SE24pLRGPahjCH4uw=
 github.com/mvdan/xurls v1.1.0 h1:OpuDelGQ1R1ueQ6sSryzi6P+1RtBpfQHM8fJwlE45ww=
 github.com/mvdan/xurls v1.1.0/go.mod h1:tQlNn3BED8bE/15hnSL2HLkDeLWpNPAwtw7wkEq44oU=
 github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U=
diff --git a/static/metadata.html.tmpl b/static/metadata.html.tmpl
new file mode 100644
index 000000000..fb1ab7878
--- /dev/null
+++ b/static/metadata.html.tmpl
@@ -0,0 +1,83 @@
+ + + + + + + {{.Name}} + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

{{.Name}}

+ +
+ +
+ +

{{.Summary}}

+ + {{range .Tags}} +
  • {{.}}
  • + {{end}} + +
    + +

    Links for {{.Name}}:

    + + {{range .SocialHandles}} +
  • {{.Platform}}
  • + {{end}} + + + +
diff --git a/static/static.go b/static/static.go
index ab3f81786..6a115acda 100644
--- a/static/static.go
+++ b/static/static.go
@@ -76,3 +76,14 @@ func getFileSystemStaticFileOrDefault(path string, defaultData []byte) []byte {
 
 	return data
 }
+
+//go:embed metadata.html.tmpl
+var botMetadataTemplate embed.FS
+
+// GetBotMetadataTemplate parses the embedded bot/scraper metadata template.
+// A parse failure is returned as an error rather than raised as a panic, so
+// the calling request handler can answer with an error status.
+func GetBotMetadataTemplate() (*template.Template, error) {
+	const name = "metadata.html.tmpl"
+	return template.ParseFS(botMetadataTemplate, name)
+}
diff --git a/test/automated/browser/bot-share-search-scrapers.test.js b/test/automated/browser/bot-share-search-scrapers.test.js
new file mode 100644
index 000000000..e25b99e8a
--- /dev/null
+++ b/test/automated/browser/bot-share-search-scrapers.test.js
@@ -0,0 +1,48 @@
+const listenForErrors = require('./lib/errors.js').listenForErrors;
+
+describe('Bot and scraper metadata page', () => {
+  // Reads the content attribute of the matching <meta property="..."> tag in <head>.
+  async function getMetaTagContent(property) {
+    const selector = `meta[property="${property}"]`;
+
+    const tag = await page.evaluate((selector) => {
+      return document.head.querySelector(selector).getAttribute("content");
+    }, selector);
+    return tag;
+  }
+
+  beforeAll(async () => {
+    await page.setViewport({ width: 1080, height: 720 });
+    listenForErrors(browser, page);
+    // setUserAgent is asynchronous: await it so the override is in place
+    // before navigating, otherwise the regular web app may be served.
+    await page.setUserAgent("Mastodon");
+    await page.goto('http://localhost:5309');
+  });
+
+  afterAll(async () => {
+    await page.waitForTimeout(3000);
+    await page.screenshot({ path: 'screenshots/screenshot_bots_share_search_scrapers.png', fullPage: true });
+  });
+
+  it('should have rendered the simple bot accessible html page', async () => {
+    await page.waitForSelector('h1');
+    await page.waitForSelector('h3');
+
+    const ogVideo = await getMetaTagContent('og:video');
+    expect(ogVideo).toBe('http://localhost:5309/embed/video');
+
+    const ogVideoType = await getMetaTagContent('og:video:type');
+    expect(ogVideoType).toBe('text/html');
+
+    // When stream is live the thumbnail is provided as the image.
+    const ogImage = await getMetaTagContent('og:image');
+    expect(ogImage).toBe('http://localhost:5309/thumbnail.jpg');
+
+    const twitterUrl = await getMetaTagContent('twitter:url');
+    expect(twitterUrl).toBe('http://localhost:5309/');
+
+    const twitterImage = await getMetaTagContent('twitter:image');
+    expect(twitterImage).toBe('http://localhost:5309/logo/external');
+  });
+});
diff --git a/utils/utils.go b/utils/utils.go
index df37e6a95..b6f1b79ac 100644
--- a/utils/utils.go
+++ b/utils/utils.go
@@ -16,6 +16,7 @@ import (
 	"strings"
 	"time"
 
+	"github.com/mssola/user_agent"
 	log "github.com/sirupsen/logrus"
 	"github.com/yuin/goldmark"
 	"github.com/yuin/goldmark/extension"
@@ -120,6 +121,34 @@ func IsUserAgentAPlayer(userAgent string) bool {
 	return false
 }
 
+// IsUserAgentABot reports whether a web client user-agent is seen as a bot.
+func IsUserAgentABot(userAgent string) bool {
+	if userAgent == "" {
+		return false
+	}
+
+	botStrings := []string{ // lowercase substrings checked before the generic detector
+		"mastodon",
+		"pleroma",
+		"applebot",
+		"whatsapp",
+		"matrix",
+		"synapse",
+		"element",
+		"rocket.chat",
+		"duckduckbot",
+	}
+
+	lowered := strings.ToLower(userAgent) // lowercase once, not once per candidate
+	for _, botString := range botStrings {
+		if strings.Contains(lowered, botString) {
+			return true
+		}
+	}
+
+	return user_agent.New(userAgent).Bot() // nothing listed matched; ask the parser
+}
+
 // RenderSimpleMarkdown will return HTML without sanitization or specific formatting rules.
 func RenderSimpleMarkdown(raw string) string {
 	markdown := goldmark.New(