From 163244f40f5e3a22a086e2ca1c6ca834b73b91e9 Mon Sep 17 00:00:00 2001 From: Alejandro Celaya Date: Sun, 21 Apr 2024 17:09:20 +0200 Subject: [PATCH] Add option to allow all URLs to be crawlable via robots.txt --- CHANGELOG.md | 17 +++++++++ composer.json | 2 +- config/autoload/installer.global.php | 1 + config/autoload/robots.global.php | 13 +++++++ module/Core/config/dependencies.config.php | 2 +- module/Core/src/Action/RobotsAction.php | 10 ++++-- module/Core/src/Config/EnvVars.php | 3 +- module/Core/test/Action/RobotsActionTest.php | 38 +++++++++++++++----- 8 files changed, 72 insertions(+), 14 deletions(-) create mode 100644 config/autoload/robots.global.php diff --git a/CHANGELOG.md b/CHANGELOG.md index ab6c1412..d2b93f65 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,23 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com), and this project adheres to [Semantic Versioning](https://semver.org). +## [Unreleased] +### Added +* [#2018](https://github.com/shlinkio/shlink/issues/2018) Add option to allow all short URLs to be unconditionally crawlable in robots.txt, via `ROBOTS_ALLOW_ALL_SHORT_URLS=true` env var, or config options. + +### Changed +* *Nothing* + +### Deprecated +* *Nothing* + +### Removed +* *Nothing* + +### Fixed +* *Nothing* + + ## [4.1.0] - 2024-04-14 ### Added * [#1330](https://github.com/shlinkio/shlink/issues/1330) All visit-related endpoints now expose the `visitedUrl` prop for any visit. diff --git a/composer.json b/composer.json index 517caf0c..c00ebee8 100644 --- a/composer.json +++ b/composer.json @@ -47,7 +47,7 @@ "shlinkio/shlink-config": "^3.0", "shlinkio/shlink-event-dispatcher": "^4.1", "shlinkio/shlink-importer": "^5.3.2", - "shlinkio/shlink-installer": "^9.1", + "shlinkio/shlink-installer": "dev-develop#11e66d8 as 9.2", "shlinkio/shlink-ip-geolocation": "^4.0", "shlinkio/shlink-json": "^1.1", "spiral/roadrunner": "^2023.3", diff --git a/config/autoload/installer.global.php b/config/autoload/installer.global.php index 4ebd6716..0fda5d06 100644 --- a/config/autoload/installer.global.php +++ b/config/autoload/installer.global.php @@ -45,6 +45,7 @@ return [ Option\UrlShortener\EnableMultiSegmentSlugsConfigOption::class, Option\UrlShortener\EnableTrailingSlashConfigOption::class, Option\UrlShortener\ShortUrlModeConfigOption::class, + Option\UrlShortener\RobotsAllowAllShortUrlsConfigOption::class, Option\Tracking\IpAnonymizationConfigOption::class, Option\Tracking\OrphanVisitsTrackingConfigOption::class, Option\Tracking\DisableTrackParamConfigOption::class, diff --git a/config/autoload/robots.global.php b/config/autoload/robots.global.php new file mode 100644 index 00000000..0ab9c5d2 --- /dev/null +++ b/config/autoload/robots.global.php @@ -0,0 +1,13 @@ + [ + 'allow-all-short-urls' => (bool) Config\EnvVars::ROBOTS_ALLOW_ALL_SHORT_URLS->loadFromEnv(false), + ], + +]; diff --git a/module/Core/config/dependencies.config.php b/module/Core/config/dependencies.config.php index 5fcc8e44..24f32360 100644 --- a/module/Core/config/dependencies.config.php +++ b/module/Core/config/dependencies.config.php @@ -189,7 +189,7 @@ return [ 'Logger_Shlink', Options\QrCodeOptions::class, ], - Action\RobotsAction::class => [Crawling\CrawlingHelper::class], + Action\RobotsAction::class => [Crawling\CrawlingHelper::class, 'config.robots.allow-all-short-urls'], ShortUrl\Resolver\PersistenceShortUrlRelationResolver::class => [ 'em', diff --git a/module/Core/src/Action/RobotsAction.php b/module/Core/src/Action/RobotsAction.php index 214dc7a0..cb3c99ea 100644 --- a/module/Core/src/Action/RobotsAction.php +++ b/module/Core/src/Action/RobotsAction.php @@ -15,9 +15,9 @@ use function sprintf; use const PHP_EOL; -class RobotsAction implements RequestHandlerInterface, StatusCodeInterface +readonly class RobotsAction implements RequestHandlerInterface, StatusCodeInterface { - public function __construct(private readonly CrawlingHelperInterface $crawlingHelper) + public function __construct(private CrawlingHelperInterface $crawlingHelper, private bool $allowAllShortUrls) { } @@ -37,6 +37,12 @@ class RobotsAction implements RequestHandlerInterface, StatusCodeInterface ROBOTS; + if ($this->allowAllShortUrls) { + // Disallow rest URLs, but allow all short codes + yield 'Disallow: /rest/'; + return; + } + $shortCodes = $this->crawlingHelper->listCrawlableShortCodes(); foreach ($shortCodes as $shortCode) { yield sprintf('Allow: /%s%s', $shortCode, PHP_EOL); diff --git a/module/Core/src/Config/EnvVars.php b/module/Core/src/Config/EnvVars.php index bae68e84..59fafb17 100644 --- a/module/Core/src/Config/EnvVars.php +++ b/module/Core/src/Config/EnvVars.php @@ -69,8 +69,9 @@ enum EnvVars: string case DEFAULT_DOMAIN = 'DEFAULT_DOMAIN'; case AUTO_RESOLVE_TITLES = 'AUTO_RESOLVE_TITLES'; case REDIRECT_APPEND_EXTRA_PATH = 'REDIRECT_APPEND_EXTRA_PATH'; - case TIMEZONE = 'TIMEZONE'; case MULTI_SEGMENT_SLUGS_ENABLED = 'MULTI_SEGMENT_SLUGS_ENABLED'; + case ROBOTS_ALLOW_ALL_SHORT_URLS = 'ROBOTS_ALLOW_ALL_SHORT_URLS'; + case TIMEZONE = 'TIMEZONE'; case MEMORY_LIMIT = 'MEMORY_LIMIT'; public function loadFromEnv(mixed $default = null): mixed diff --git a/module/Core/test/Action/RobotsActionTest.php b/module/Core/test/Action/RobotsActionTest.php index 6523c2c5..32c1b036 100644 --- a/module/Core/test/Action/RobotsActionTest.php +++ b/module/Core/test/Action/RobotsActionTest.php @@ -14,24 +14,25 @@ use Shlinkio\Shlink\Core\Crawling\CrawlingHelperInterface; class RobotsActionTest extends TestCase { - private RobotsAction $action; private MockObject & CrawlingHelperInterface $helper; protected function setUp(): void { $this->helper = $this->createMock(CrawlingHelperInterface::class); - $this->action = new RobotsAction($this->helper); } #[Test, DataProvider('provideShortCodes')] - public function buildsRobotsLinesFromCrawlableShortCodes(array $shortCodes, string $expected): void - { + public function buildsRobotsLinesFromCrawlableShortCodes( + array $shortCodes, + bool $allowAllShortUrls, + string $expected, + ): void { $this->helper - ->expects($this->once()) + ->expects($allowAllShortUrls ? $this->never() : $this->once()) ->method('listCrawlableShortCodes') ->willReturn($shortCodes); - $response = $this->action->handle(ServerRequestFactory::fromGlobals()); + $response = $this->action($allowAllShortUrls)->handle(ServerRequestFactory::fromGlobals()); self::assertEquals(200, $response->getStatusCode()); self::assertEquals($expected, $response->getBody()->__toString()); @@ -40,7 +41,7 @@ class RobotsActionTest extends TestCase public static function provideShortCodes(): iterable { - yield 'three short codes' => [['foo', 'bar', 'baz'], << [['foo', 'bar', 'baz'], false, << [['foo', 'bar', 'some', 'thing', 'baz'], << [['foo', 'bar', 'some', 'thing', 'baz'], false, << [[], << [[], false, << [['foo', 'bar', 'some'], true, << [[], true, <<helper, allowAllShortUrls: $allowAllShortUrls); } }