From 18f656fed236007e0a2d3db1fed76d4cc7275594 Mon Sep 17 00:00:00 2001 From: Alejandro Celaya Date: Sun, 1 May 2022 11:48:20 +0200 Subject: [PATCH] Changed logic when resolving the title of a URL, to ensure only html content is tried to be downloaded, and only until the title tag has been parsed --- CHANGELOG.md | 17 +++++++++ composer.json | 2 +- module/Core/src/Util/UrlValidator.php | 18 ++++++++-- module/Core/test/Util/UrlValidatorTest.php | 41 ++++++++++++++++++++-- 4 files changed, 72 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 882bd2ed..fdaa75e6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,23 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com), and this project adheres to [Semantic Versioning](https://semver.org). +## [Unreleased] +### Added +* *Nothing* + +### Changed +* *Nothing* + +### Deprecated +* *Nothing* + +### Removed +* *Nothing* + +### Fixed +* [#1439](https://github.com/shlinkio/shlink/issues/1439) Fixed crash when trying to auto-resolve titles for URLs which serve large binary files. + + ## [3.1.0] - 2022-04-23 ### Added * [#1294](https://github.com/shlinkio/shlink/issues/1294) Allowed to provide a specific domain when importing URLs from YOURLS. diff --git a/composer.json b/composer.json index 1179e136..d310a103 100644 --- a/composer.json +++ b/composer.json @@ -139,7 +139,7 @@ "test:api": "bin/test/run-api-tests.sh", "test:api:ci": "GENERATE_COVERAGE=yes composer test:api", "infect:ci:base": "infection --threads=4 --log-verbosity=default --only-covered --only-covering-test-cases --skip-initial-tests", - "infect:ci:unit": "@infect:ci:base --coverage=build/coverage-unit --min-msi=85", + "infect:ci:unit": "@infect:ci:base --coverage=build/coverage-unit --min-msi=84", "infect:ci:db": "@infect:ci:base --coverage=build/coverage-db --min-msi=95 --configuration=infection-db.json", "infect:ci:api": "@infect:ci:base --coverage=build/coverage-api --min-msi=80 --configuration=infection-api.json", "infect:ci": "@parallel infect:ci:unit infect:ci:db infect:ci:api", diff --git a/module/Core/src/Util/UrlValidator.php b/module/Core/src/Util/UrlValidator.php index 006ca372..6bd8d76f 100644 --- a/module/Core/src/Util/UrlValidator.php +++ b/module/Core/src/Util/UrlValidator.php @@ -14,6 +14,9 @@ use Shlinkio\Shlink\Core\Options\UrlShortenerOptions; use Throwable; use function preg_match; +use function str_contains; +use function str_starts_with; +use function strtolower; use function trim; use const Shlinkio\Shlink\TITLE_TAG_VALUE; @@ -56,8 +59,18 @@ class UrlValidator implements UrlValidatorInterface, RequestMethodInterface return null; } - $body = $response->getBody()->__toString(); - preg_match(TITLE_TAG_VALUE, $body, $matches); + $contentType = strtolower($response->getHeaderLine('Content-Type')); + if (! str_starts_with($contentType, 'text/html')) { + return null; + } + + $collectedBody = ''; + $body = $response->getBody(); + // With streaming enabled, we can walk the body until the tag is found, and then stop + while (! str_contains($collectedBody, '') && ! $body->eof()) { + $collectedBody .= $body->read(1024); + } + preg_match(TITLE_TAG_VALUE, $collectedBody, $matches); return isset($matches[1]) ? trim($matches[1]) : null; } @@ -73,6 +86,7 @@ class UrlValidator implements UrlValidatorInterface, RequestMethodInterface RequestOptions::IDN_CONVERSION => true, // Making the request with a browser's user agent makes the validation closer to a real user RequestOptions::HEADERS => ['User-Agent' => self::CHROME_USER_AGENT], + RequestOptions::STREAM => true, // This ensures large files are not fully downloaded if not needed ]); } catch (GuzzleException $e) { throw InvalidUrlException::fromUrl($url, $e); diff --git a/module/Core/test/Util/UrlValidatorTest.php b/module/Core/test/Util/UrlValidatorTest.php index 7a943ea0..7e1f9a1b 100644 --- a/module/Core/test/Util/UrlValidatorTest.php +++ b/module/Core/test/Util/UrlValidatorTest.php @@ -132,11 +132,46 @@ class UrlValidatorTest extends TestCase $request->shouldHaveBeenCalledOnce(); } + /** @test */ + public function validateUrlWithTitleReturnsNullWhenAutoResolutionIsEnabledAndReturnedContentTypeIsInvalid(): void + { + $request = $this->httpClient->request(RequestMethodInterface::METHOD_GET, Argument::cetera())->willReturn( + new Response('php://memory', 200, ['Content-Type' => 'application/octet-stream']), + ); + $this->options->autoResolveTitles = true; + + $result = $this->urlValidator->validateUrlWithTitle('http://foobar.com/12345/hello?foo=bar', true); + + self::assertNull($result); + $request->shouldHaveBeenCalledOnce(); + } + + /** @test */ + public function validateUrlWithTitleReturnsNullWhenAutoResolutionIsEnabledAndBodyDoesNotContainTitle(): void + { + $request = $this->httpClient->request(RequestMethodInterface::METHOD_GET, Argument::cetera())->willReturn( + new Response($this->createStreamWithContent('No title'), 200, ['Content-Type' => 'text/html']), + ); + $this->options->autoResolveTitles = true; + + $result = $this->urlValidator->validateUrlWithTitle('http://foobar.com/12345/hello?foo=bar', true); + + self::assertNull($result); + $request->shouldHaveBeenCalledOnce(); + } + private function respWithTitle(): Response { - $body = new Stream('php://temp', 'wr'); - $body->write(' Resolved title'); + $body = $this->createStreamWithContent(' Resolved title'); + return new Response($body, 200, ['Content-Type' => 'TEXT/html; charset=utf-8']); + } - return new Response($body); + private function createStreamWithContent(string $content): Stream + { + $body = new Stream('php://temp', 'wr'); + $body->write($content); + $body->rewind(); + + return $body; } }