mirror of
https://github.com/shlinkio/shlink.git
synced 2024-11-22 21:02:24 +03:00
Added detection of visits from potential bots
This commit is contained in:
parent
663ae9f6bb
commit
9fa32b5b6b
16 changed files with 123 additions and 19 deletions
|
@ -25,6 +25,7 @@
|
|||
"geoip2/geoip2": "^2.9",
|
||||
"guzzlehttp/guzzle": "^7.0",
|
||||
"happyr/doctrine-specification": "^2.0",
|
||||
"jaybizzle/crawler-detect": "^1.2",
|
||||
"laminas/laminas-config": "^3.3",
|
||||
"laminas/laminas-config-aggregator": "^1.1",
|
||||
"laminas/laminas-diactoros": "^2.1.3",
|
||||
|
|
28
data/migrations/Version20210522124633.php
Normal file
28
data/migrations/Version20210522124633.php
Normal file
|
@ -0,0 +1,28 @@
|
|||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace ShlinkMigrations;
|
||||
|
||||
use Doctrine\DBAL\Schema\Schema;
|
||||
use Doctrine\DBAL\Types\Types;
|
||||
use Doctrine\Migrations\AbstractMigration;
|
||||
|
||||
/**
 * Adds the boolean "potential_bot" column to the "visits" table, and removes it on rollback.
 *
 * Each direction first checks the current schema and is skipped (via skipIf) when the table
 * is already in the state that direction would produce.
 */
final class Version20210522124633 extends AbstractMigration
{
    private const POTENTIAL_BOT_COLUMN = 'potential_bot';

    /**
     * Creates the column with a default value of false, unless it is already present.
     */
    public function up(Schema $schema): void
    {
        $visitsTable = $schema->getTable('visits');
        $columnAlreadyExists = $visitsTable->hasColumn(self::POTENTIAL_BOT_COLUMN);
        $this->skipIf($columnAlreadyExists);

        $visitsTable->addColumn(self::POTENTIAL_BOT_COLUMN, Types::BOOLEAN, ['default' => false]);
    }

    /**
     * Drops the column, unless it does not exist.
     */
    public function down(Schema $schema): void
    {
        $visitsTable = $schema->getTable('visits');
        $columnIsMissing = ! $visitsTable->hasColumn(self::POTENTIAL_BOT_COLUMN);
        $this->skipIf($columnIsMissing);

        $visitsTable->dropColumn(self::POTENTIAL_BOT_COLUMN);
    }
}
|
|
@ -190,6 +190,10 @@
|
|||
},
|
||||
"visitLocation": {
|
||||
"$ref": "#/components/schemas/VisitLocation"
|
||||
},
|
||||
"potentialBot": {
|
||||
"type": "boolean",
|
||||
"description": "Tells if Shlink thinks this visit potentially comes from a bot or crawler"
|
||||
}
|
||||
},
|
||||
"example": {
|
||||
|
@ -204,7 +208,8 @@
|
|||
"longitude": -122.0946,
|
||||
"regionName": "California",
|
||||
"timezone": "America/Los_Angeles"
|
||||
}
|
||||
},
|
||||
"potentialBot": false
|
||||
}
|
||||
},
|
||||
"OrphanVisit": {
|
||||
|
@ -243,6 +248,7 @@
|
|||
"regionName": "California",
|
||||
"timezone": "America/Los_Angeles"
|
||||
},
|
||||
"potentialBot": false,
|
||||
"visitedUrl": "https://doma.in",
|
||||
"type": "base_url"
|
||||
}
|
||||
|
|
|
@ -17,6 +17,10 @@
|
|||
},
|
||||
"visitLocation": {
|
||||
"$ref": "./VisitLocation.json"
|
||||
},
|
||||
"potentialBot": {
|
||||
"type": "boolean",
|
||||
"description": "Tells if Shlink thinks this visit potentially comes from a bot or crawler"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -98,7 +98,8 @@
|
|||
"referer": "https://twitter.com",
|
||||
"date": "2015-08-20T05:05:03+04:00",
|
||||
"userAgent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0 Mozilla/5.0 (Macintosh; Intel Mac OS X x.y; rv:42.0) Gecko/20100101 Firefox/42.0",
|
||||
"visitLocation": null
|
||||
"visitLocation": null,
|
||||
"potentialBot": false
|
||||
},
|
||||
{
|
||||
"referer": "https://t.co",
|
||||
|
@ -112,13 +113,15 @@
|
|||
"longitude": -122.0946,
|
||||
"regionName": "California",
|
||||
"timezone": "America/Los_Angeles"
|
||||
}
|
||||
},
|
||||
"potentialBot": false
|
||||
},
|
||||
{
|
||||
"referer": null,
|
||||
"date": "2015-08-20T05:05:03+04:00",
|
||||
"userAgent": "some_web_crawler/1.4",
|
||||
"visitLocation": null
|
||||
"visitLocation": null,
|
||||
"potentialBot": true
|
||||
}
|
||||
],
|
||||
"pagination": {
|
||||
|
|
|
@ -95,7 +95,8 @@
|
|||
"referer": "https://twitter.com",
|
||||
"date": "2015-08-20T05:05:03+04:00",
|
||||
"userAgent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0 Mozilla/5.0 (Macintosh; Intel Mac OS X x.y; rv:42.0) Gecko/20100101 Firefox/42.0",
|
||||
"visitLocation": null
|
||||
"visitLocation": null,
|
||||
"potentialBot": false
|
||||
},
|
||||
{
|
||||
"referer": "https://t.co",
|
||||
|
@ -109,13 +110,15 @@
|
|||
"longitude": -122.0946,
|
||||
"regionName": "California",
|
||||
"timezone": "America/Los_Angeles"
|
||||
}
|
||||
},
|
||||
"potentialBot": false
|
||||
},
|
||||
{
|
||||
"referer": null,
|
||||
"date": "2015-08-20T05:05:03+04:00",
|
||||
"userAgent": "some_web_crawler/1.4",
|
||||
"visitLocation": null
|
||||
"visitLocation": null,
|
||||
"potentialBot": true
|
||||
}
|
||||
],
|
||||
"pagination": {
|
||||
|
|
|
@ -87,6 +87,7 @@
|
|||
"date": "2015-08-20T05:05:03+04:00",
|
||||
"userAgent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0 Mozilla/5.0 (Macintosh; Intel Mac OS X x.y; rv:42.0) Gecko/20100101 Firefox/42.0",
|
||||
"visitLocation": null,
|
||||
"potentialBot": false,
|
||||
"visitedUrl": "https://doma.in",
|
||||
"type": "base_url"
|
||||
},
|
||||
|
@ -103,6 +104,7 @@
|
|||
"regionName": "California",
|
||||
"timezone": "America/Los_Angeles"
|
||||
},
|
||||
"potentialBot": false,
|
||||
"visitedUrl": "https://doma.in/foo",
|
||||
"type": "invalid_short_url"
|
||||
},
|
||||
|
@ -111,6 +113,7 @@
|
|||
"date": "2015-08-20T05:05:03+04:00",
|
||||
"userAgent": "some_web_crawler/1.4",
|
||||
"visitLocation": null,
|
||||
"potentialBot": true,
|
||||
"visitedUrl": "https://doma.in/foo/bar/baz",
|
||||
"type": "regular_404"
|
||||
}
|
||||
|
|
|
@ -65,4 +65,9 @@ return static function (ClassMetadata $metadata, array $emConfig): void {
|
|||
->columnName('type')
|
||||
->length(255)
|
||||
->build();
|
||||
|
||||
$builder->createField('potentialBot', Types::BOOLEAN)
|
||||
->columnName('potential_bot')
|
||||
->option('default', false)
|
||||
->build();
|
||||
};
|
||||
|
|
|
@ -7,6 +7,7 @@ namespace Shlinkio\Shlink\Core;
|
|||
use Cake\Chronos\Chronos;
|
||||
use DateTimeInterface;
|
||||
use Fig\Http\Message\StatusCodeInterface;
|
||||
use Jaybizzle\CrawlerDetect\CrawlerDetect;
|
||||
use Laminas\InputFilter\InputFilter;
|
||||
use PUGX\Shortid\Factory as ShortIdFactory;
|
||||
use Shlinkio\Shlink\Common\Util\DateRange;
|
||||
|
@ -128,3 +129,13 @@ function kebabCaseToCamelCase(string $name): string
|
|||
{
|
||||
return lcfirst(str_replace(' ', '', ucwords(str_replace('-', ' ', $name))));
|
||||
}
|
||||
|
||||
/**
 * Tells whether the provided user agent is reported as a crawler by CrawlerDetect.
 *
 * The CrawlerDetect instance is created on the first call and kept in a function-level
 * static variable, so later calls reuse the same object.
 */
function isCrawler(string $userAgent): bool
{
    static $crawlerDetect = null;
    $crawlerDetect ??= new CrawlerDetect();

    return $crawlerDetect->isCrawler($userAgent);
}
|
||||
|
|
|
@ -13,6 +13,8 @@ use Shlinkio\Shlink\Core\Model\Visitor;
|
|||
use Shlinkio\Shlink\Core\Visit\Model\VisitLocationInterface;
|
||||
use Shlinkio\Shlink\Importer\Model\ImportedShlinkVisit;
|
||||
|
||||
use function Shlinkio\Shlink\Core\isCrawler;
|
||||
|
||||
class Visit extends AbstractEntity implements JsonSerializable
|
||||
{
|
||||
public const TYPE_VALID_SHORT_URL = 'valid_short_url';
|
||||
|
@ -29,6 +31,7 @@ class Visit extends AbstractEntity implements JsonSerializable
|
|||
private string $type;
|
||||
private ?ShortUrl $shortUrl;
|
||||
private ?VisitLocation $visitLocation = null;
|
||||
private bool $potentialBot;
|
||||
|
||||
private function __construct(?ShortUrl $shortUrl, string $type)
|
||||
{
|
||||
|
@ -49,6 +52,7 @@ class Visit extends AbstractEntity implements JsonSerializable
|
|||
{
|
||||
$instance = new self($shortUrl, self::TYPE_IMPORTED);
|
||||
$instance->userAgent = $importedVisit->userAgent();
|
||||
$instance->potentialBot = isCrawler($instance->userAgent);
|
||||
$instance->referer = $importedVisit->referer();
|
||||
$instance->date = Chronos::instance($importedVisit->date());
|
||||
|
||||
|
@ -88,6 +92,7 @@ class Visit extends AbstractEntity implements JsonSerializable
|
|||
$this->referer = $visitor->getReferer();
|
||||
$this->remoteAddr = $this->processAddress($anonymize, $visitor->getRemoteAddress());
|
||||
$this->visitedUrl = $visitor->getVisitedUrl();
|
||||
$this->potentialBot = $visitor->isPotentialBot();
|
||||
}
|
||||
|
||||
private function processAddress(bool $anonymize, ?string $address): ?string
|
||||
|
@ -166,6 +171,7 @@ class Visit extends AbstractEntity implements JsonSerializable
|
|||
'date' => $this->date->toAtomString(),
|
||||
'userAgent' => $this->userAgent,
|
||||
'visitLocation' => $this->visitLocation,
|
||||
'potentialBot' => $this->potentialBot,
|
||||
];
|
||||
}
|
||||
|
||||
|
|
|
@ -8,6 +8,7 @@ use Psr\Http\Message\ServerRequestInterface;
|
|||
use Shlinkio\Shlink\Common\Middleware\IpAddressMiddlewareFactory;
|
||||
use Shlinkio\Shlink\Core\Options\TrackingOptions;
|
||||
|
||||
use function Shlinkio\Shlink\Core\isCrawler;
|
||||
use function substr;
|
||||
|
||||
final class Visitor
|
||||
|
@ -21,6 +22,7 @@ final class Visitor
|
|||
private string $referer;
|
||||
private string $visitedUrl;
|
||||
private ?string $remoteAddress;
|
||||
private bool $potentialBot;
|
||||
|
||||
public function __construct(string $userAgent, string $referer, ?string $remoteAddress, string $visitedUrl)
|
||||
{
|
||||
|
@ -28,6 +30,7 @@ final class Visitor
|
|||
$this->referer = $this->cropToLength($referer, self::REFERER_MAX_LENGTH);
|
||||
$this->visitedUrl = $this->cropToLength($visitedUrl, self::VISITED_URL_MAX_LENGTH);
|
||||
$this->remoteAddress = $this->cropToLength($remoteAddress, self::REMOTE_ADDRESS_MAX_LENGTH);
|
||||
$this->potentialBot = isCrawler($userAgent);
|
||||
}
|
||||
|
||||
private function cropToLength(?string $value, int $length): ?string
|
||||
|
@ -70,14 +73,22 @@ final class Visitor
|
|||
return $this->visitedUrl;
|
||||
}
|
||||
|
||||
/**
 * Tells if this visitor has been flagged as a potential bot or crawler.
 */
public function isPotentialBot(): bool
{
    return $this->potentialBot;
}
|
||||
|
||||
/**
 * Builds a copy of this visitor in which every field whose tracking is disabled is blanked.
 *
 * The user agent and referer are replaced by an empty string, and the remote address by null,
 * when the corresponding option is disabled. The visited URL is always carried over.
 *
 * @param TrackingOptions $options Options telling which pieces of information must not be tracked
 * @return self A new instance; the current one is not modified
 */
public function normalizeForTrackingOptions(TrackingOptions $options): self
{
    $instance = new self(
        $options->disableUaTracking() ? '' : $this->userAgent,
        $options->disableReferrerTracking() ? '' : $this->referer,
        $options->disableIpTracking() ? null : $this->remoteAddress,
        $this->visitedUrl,
    );

    // Keep the fact that the visit was a potential bot, even if we no longer save the user agent.
    // The constructor derives the flag from the user agent it receives, which may have been blanked above.
    $instance->potentialBot = $this->potentialBot;

    return $instance;
}
|
||||
|
|
|
@ -12,19 +12,35 @@ use Shlinkio\Shlink\Core\Model\Visitor;
|
|||
|
||||
class VisitTest extends TestCase
|
||||
{
|
||||
/**
 * Checks the serialized shape of a visit, including the potential-bot flag expected for
 * the provided user agent.
 *
 * @test
 * @dataProvider provideUserAgents
 */
public function isProperlyJsonSerialized(string $userAgent, bool $expectedToBePotentialBot): void
{
    $visit = Visit::forValidShortUrl(ShortUrl::createEmpty(), new Visitor($userAgent, 'some site', '1.2.3.4', ''));

    self::assertEquals([
        'referer' => 'some site',
        'date' => $visit->getDate()->toAtomString(),
        'userAgent' => $userAgent,
        'visitLocation' => null,
        'potentialBot' => $expectedToBePotentialBot,
    ], $visit->jsonSerialize());
}
|
||||
|
||||
/**
 * Named datasets pairing a user agent string with whether it is expected to be flagged
 * as a potential bot.
 *
 * @return iterable<string, array{string, bool}>
 */
public function provideUserAgents(): iterable
{
    yield from [
        'Chrome' => [
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36',
            false,
        ],
        'Firefox' => ['Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0', false],
        'Facebook' => ['cf-facebook', true],
        'Twitter' => ['IDG Twitter Links Resolver', true],
        'Guzzle' => ['guzzlehttp', true],
    ];
}
|
||||
|
||||
/**
|
||||
* @test
|
||||
* @dataProvider provideAddresses
|
||||
|
|
|
@ -66,6 +66,7 @@ class MercureUpdatesGeneratorTest extends TestCase
|
|||
'userAgent' => '',
|
||||
'visitLocation' => null,
|
||||
'date' => $visit->getDate()->toAtomString(),
|
||||
'potentialBot' => false,
|
||||
],
|
||||
], json_decode($update->getData()));
|
||||
}
|
||||
|
@ -91,6 +92,7 @@ class MercureUpdatesGeneratorTest extends TestCase
|
|||
'userAgent' => '',
|
||||
'visitLocation' => null,
|
||||
'date' => $orphanVisit->getDate()->toAtomString(),
|
||||
'potentialBot' => false,
|
||||
'visitedUrl' => $orphanVisit->visitedUrl(),
|
||||
'type' => $orphanVisit->type(),
|
||||
],
|
||||
|
|
|
@ -42,6 +42,7 @@ class OrphanVisitDataTransformerTest extends TestCase
|
|||
'date' => $visit->getDate()->toAtomString(),
|
||||
'userAgent' => '',
|
||||
'visitLocation' => null,
|
||||
'potentialBot' => false,
|
||||
'visitedUrl' => '',
|
||||
'type' => Visit::TYPE_BASE_URL,
|
||||
],
|
||||
|
@ -57,6 +58,7 @@ class OrphanVisitDataTransformerTest extends TestCase
|
|||
'date' => $visit->getDate()->toAtomString(),
|
||||
'userAgent' => 'foo',
|
||||
'visitLocation' => null,
|
||||
'potentialBot' => false,
|
||||
'visitedUrl' => 'https://example.com/foo',
|
||||
'type' => Visit::TYPE_INVALID_SHORT_URL,
|
||||
],
|
||||
|
@ -74,6 +76,7 @@ class OrphanVisitDataTransformerTest extends TestCase
|
|||
'date' => $visit->getDate()->toAtomString(),
|
||||
'userAgent' => 'user-agent',
|
||||
'visitLocation' => $location,
|
||||
'potentialBot' => false,
|
||||
'visitedUrl' => 'https://doma.in/foo/bar',
|
||||
'type' => Visit::TYPE_REGULAR_404,
|
||||
],
|
||||
|
|
|
@ -12,17 +12,18 @@ class OrphanVisitsTest extends ApiTestCase
|
|||
private const INVALID_SHORT_URL = [
|
||||
'referer' => 'https://doma.in/foo',
|
||||
'date' => '2020-03-01T00:00:00+00:00',
|
||||
'userAgent' => 'shlink-tests-agent',
|
||||
'userAgent' => 'cf-facebook',
|
||||
'visitLocation' => null,
|
||||
'potentialBot' => true,
|
||||
'visitedUrl' => 'foo.com',
|
||||
'type' => 'invalid_short_url',
|
||||
|
||||
];
|
||||
private const REGULAR_NOT_FOUND = [
|
||||
'referer' => 'https://doma.in/foo/bar',
|
||||
'date' => '2020-02-01T00:00:00+00:00',
|
||||
'userAgent' => 'shlink-tests-agent',
|
||||
'visitLocation' => null,
|
||||
'potentialBot' => false,
|
||||
'visitedUrl' => '',
|
||||
'type' => 'regular_404',
|
||||
];
|
||||
|
@ -31,6 +32,7 @@ class OrphanVisitsTest extends ApiTestCase
|
|||
'date' => '2020-01-01T00:00:00+00:00',
|
||||
'userAgent' => 'shlink-tests-agent',
|
||||
'visitLocation' => null,
|
||||
'potentialBot' => false,
|
||||
'visitedUrl' => '',
|
||||
'type' => 'base_url',
|
||||
];
|
||||
|
|
|
@ -58,7 +58,7 @@ class VisitsFixture extends AbstractFixture implements DependentFixtureInterface
|
|||
'2020-02-01',
|
||||
));
|
||||
$manager->persist($this->setVisitDate(
|
||||
Visit::forInvalidShortUrl(new Visitor('shlink-tests-agent', 'https://doma.in/foo', '1.2.3.4', 'foo.com')),
|
||||
Visit::forInvalidShortUrl(new Visitor('cf-facebook', 'https://doma.in/foo', '1.2.3.4', 'foo.com')),
|
||||
'2020-03-01',
|
||||
));
|
||||
|
||||
|
|
Loading…
Reference in a new issue