From 9b50151f17b5921b68b3c413a26edf8ec6cdc6f8 Mon Sep 17 00:00:00 2001
From: Daenney <daenney@users.noreply.github.com>
Date: Fri, 2 Aug 2024 18:22:39 +0200
Subject: [PATCH] [feature] Beef up our AI opt-outs (#3165)

* [chore] Synchronise our robots.txt with upstream

* [feature] Add headers to escape AI crawlers

This emits the X-Robots-Tag header twice, with the values "noimageai" and
"noai", which a number of AI crawlers respect as a signal that content
should not be included in their datasets.
---
 internal/middleware/extraheaders.go | 7 +++++++
 internal/web/robots.go              | 9 +++++++++
 2 files changed, 16 insertions(+)

diff --git a/internal/middleware/extraheaders.go b/internal/middleware/extraheaders.go
index c75b65551..fb91bcc93 100644
--- a/internal/middleware/extraheaders.go
+++ b/internal/middleware/extraheaders.go
@@ -44,5 +44,12 @@ func ExtraHeaders() gin.HandlerFunc {
 		//
 		// See: https://github.com/patcg-individual-drafts/topics
 		c.Header("Permissions-Policy", "browsing-topics=()")
+
+		// Some AI scrapers respect the following tags as a way to
+		// opt out of their crawling and datasets.
+		c.Header("X-Robots-Tag", "noimageai")
+		// c.Header calls .Set(), but we want to emit the header
+		// twice, not override it.
+		c.Writer.Header().Add("X-Robots-Tag", "noai")
 	}
 }
diff --git a/internal/web/robots.go b/internal/web/robots.go
index 39708eb55..3309de97c 100644
--- a/internal/web/robots.go
+++ b/internal/web/robots.go
@@ -43,15 +43,24 @@ User-agent: Claude-Web
 User-agent: cohere-ai
 User-agent: Diffbot
 User-agent: FacebookBot
+User-agent: facebookexternalhit
 User-agent: FriendlyCrawler
 User-agent: Google-Extended
 User-agent: GoogleOther
+User-agent: GoogleOther-Image
+User-agent: GoogleOther-Video
 User-agent: GPTBot
 User-agent: ImagesiftBot
 User-agent: img2dataset
+User-agent: Meta-ExternalAgent
+User-agent: OAI-SearchBot
 User-agent: omgili
 User-agent: omgilibot
 User-agent: PerplexityBot
+User-agent: PetalBot
+User-agent: Scrapy
+User-agent: Timpibot
+User-agent: VelenPublicWebCrawler
 User-agent: YouBot
 Disallow: /