Adopt WebDriverAbstract as a solution for active (JavaScript) websites (#3971)

* first working version

---------

Co-authored-by: Dag <me@dvikan.no>
This commit is contained in:
hleskien 2024-02-10 04:42:22 +01:00 committed by GitHub
parent ff7840d60f
commit 8e8028b786
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
7 changed files with 473 additions and 1 deletions

View file

@ -0,0 +1,164 @@
<?php
use Facebook\WebDriver\Exception\NoSuchElementException;
use Facebook\WebDriver\Remote\RemoteWebElement;
use Facebook\WebDriver\WebDriverBy;
use Facebook\WebDriver\WebDriverExpectedCondition;
/**
 * Bridge for the project search on gulp.de.
 *
 * The result list is rendered by JavaScript, so the page is driven through a
 * remote browser (see WebDriverAbstract). The bridge rejects the cookie
 * banner, then walks through the paginated result list and converts every
 * `app-project-view` element into a feed item.
 */
class GULPProjekteBridge extends WebDriverAbstract
{
    const NAME = 'GULP Projekte';
    const URI = 'https://www.gulp.de/gulp2/g/projekte';
    const DESCRIPTION = 'Projektsuche';
    const MAINTAINER = 'hleskien';

    /**
     * Paging stops as soon as at least this many items have been collected.
     * The last page is always consumed completely, so the final count may
     * be larger than this value.
     */
    const MAXITEMS = 60;

    /**
     * Adds accept language german to the Chrome Options.
     *
     * @return Facebook\WebDriver\Chrome\ChromeOptions
     */
    protected function getBrowserOptions()
    {
        $chromeOptions = parent::getBrowserOptions();
        $chromeOptions->addArguments(['--accept-lang=de']);
        return $chromeOptions;
    }

    /**
     * Waits for the "reject all" button of the cookie banner, clicks it and
     * waits until the button is no longer visible.
     *
     * @throws Facebook\WebDriver\Exception\NoSuchElementException
     * @throws Facebook\WebDriver\Exception\TimeoutException
     */
    protected function clickAwayCookieBanner()
    {
        $rejectButton = WebDriverBy::id('onetrust-reject-all-handler');

        $this->getDriver()->wait()->until(WebDriverExpectedCondition::visibilityOfElementLocated($rejectButton));
        $this->getDriver()->findElement($rejectButton)->click();
        $this->getDriver()->wait()->until(WebDriverExpectedCondition::invisibilityOfElementLocated($rejectButton));
    }

    /**
     * Clicks the "next page" link of the paginator and waits until a link
     * with the previous target is no longer present, i.e. until the
     * paginator has been re-rendered for the new page.
     *
     * @throws Facebook\WebDriver\Exception\NoSuchElementException If there is no "next page" link.
     * @throws Facebook\WebDriver\Exception\TimeoutException If the paginator does not change in time.
     */
    protected function clickNextPage()
    {
        $nextPage = $this->getDriver()->findElement(WebDriverBy::xpath('//app-linkable-paginator//li[@id="next-page"]/a'));
        $href = $nextPage->getAttribute('href');
        $nextPage->click();
        $this->getDriver()->wait()->until(WebDriverExpectedCondition::not(
            WebDriverExpectedCondition::presenceOfElementLocated(
                WebDriverBy::xpath('//app-linkable-paginator//li[@id="next-page"]/a[@href="' . $href . '"]')
            )
        ));
    }

    /**
     * Returns the uri of the 'Projektanbieter' logo or false if there is
     * no usable logo present in the item.
     *
     * @return string | false
     */
    protected function getLogo(RemoteWebElement $item)
    {
        try {
            $logo = $item->findElement(WebDriverBy::tagName('img'))->getAttribute('src');
        } catch (NoSuchElementException $e) {
            return false;
        }

        // An <img> without a (non-empty) src would otherwise yield the bare
        // base uri as enclosure.
        if (!is_string($logo) || $logo === '') {
            return false;
        }

        if (str_starts_with($logo, 'http')) {
            // different domain
            return $logo;
        }

        // relative path: resolve against the "directory" part of URI, i.e.
        // everything up to and including the last slash
        return substr(self::URI, 0, strrpos(self::URI, '/') + 1) . $logo;
    }

    /**
     * Converts a string like "vor einigen Minuten" into a reasonable timestamp.
     * Long and complicated, but we don't want to be more specific than
     * the information we have available: besides the stated amount, all
     * inferior units of the current time are subtracted as well (e.g. for
     * "vor 2 Stunden" the current minutes and seconds are removed too).
     *
     * @param string $timeAgo Text of the form "vor <quantity> <unit>".
     * @return int Unix timestamp.
     * @throws UnexpectedValueException If the text contains none of the known units.
     * @throws Exception If the DateInterval can't be parsed.
     */
    protected function getTimestamp(string $timeAgo): int
    {
        $now = new DateTimeImmutable();
        $hours = (int) $now->format('H');
        $minutes = (int) $now->format('i');
        $seconds = (int) $now->format('s');

        // The quantity is the second word; tolerate texts without one.
        $quantityStr = explode(' ', $timeAgo)[1] ?? '';

        // convert possible word into a number
        if (in_array($quantityStr, ['einem', 'einer', 'einigen'], true)) {
            $quantity = 1;
        } else {
            $quantity = intval($quantityStr);
        }

        // subtract time ago + inferior units for lower precision
        $intervalSpec = match (true) {
            str_contains($timeAgo, 'Sekunde') => sprintf('PT%dS', $quantity),
            str_contains($timeAgo, 'Minute') => sprintf('PT%dM%dS', $quantity, $seconds),
            str_contains($timeAgo, 'Stunde') => sprintf('PT%dH%dM%dS', $quantity, $minutes, $seconds),
            str_contains($timeAgo, 'Tag') => sprintf('P%dDT%dH%dM%dS', $quantity, $hours, $minutes, $seconds),
            default => throw new UnexpectedValueException($timeAgo),
        };

        return $now->sub(new DateInterval($intervalSpec))->getTimestamp();
    }

    /**
     * Builds one feed item from a single `app-project-view` element.
     *
     * @throws Facebook\WebDriver\Exception\NoSuchElementException If an expected child element is missing.
     */
    private function createFeedItem(RemoteWebElement $item): FeedItem
    {
        $feedItem = new FeedItem();

        $heading = $item->findElement(WebDriverBy::xpath('.//app-heading-tag/h1/a'));
        $feedItem->setTitle($heading->getText());
        $feedItem->setURI('https://www.gulp.de' . $heading->getAttribute('href'));

        $info = $item->findElement(WebDriverBy::tagName('app-icon-info-list'));
        if ($logo = $this->getLogo($item)) {
            $feedItem->setEnclosures([$logo]);
        }
        if (str_contains($info->getText(), 'Projektanbieter:')) {
            $feedItem->setAuthor($info->findElement(WebDriverBy::xpath('.//li/span[2]/span'))->getText());
        } else {
            // mostly "Direkt vom Auftraggeber" or "GULP Agentur"
            $feedItem->setAuthor($item->findElement(WebDriverBy::tagName('b'))->getText());
        }

        $feedItem->setContent($item->findElement(WebDriverBy::xpath('.//p[@class="description"]'))->getText());

        $timeAgo = $item->findElement(WebDriverBy::xpath('.//small[contains(@class, "time-ago")]'))->getText();
        $feedItem->setTimestamp($this->getTimestamp($timeAgo));

        return $feedItem;
    }

    /**
     * The main loop which clicks through search result pages and puts
     * the content into the $items array.
     *
     * Paging ends when MAXITEMS is reached or when no further page can be
     * opened. In the latter case the items collected so far are kept; the
     * paging error is only propagated if nothing was collected at all.
     *
     * @throws Facebook\WebDriver\Exception\NoSuchElementException
     * @throws Facebook\WebDriver\Exception\TimeoutException
     */
    public function collectData()
    {
        parent::collectData();

        try {
            $this->clickAwayCookieBanner();
            $this->setIcon($this->getDriver()->findElement(WebDriverBy::xpath('//link[@rel="shortcut icon"]'))->getAttribute('href'));

            while (true) {
                $items = $this->getDriver()->findElements(WebDriverBy::tagName('app-project-view'));
                foreach ($items as $item) {
                    $this->items[] = $this->createFeedItem($item);
                }

                if (count($this->items) >= self::MAXITEMS) {
                    break;
                }

                try {
                    $this->clickNextPage();
                } catch (NoSuchElementException | \Facebook\WebDriver\Exception\TimeoutException $e) {
                    // No (working) "next page" link: the last result page has
                    // been reached. Keep what we have instead of failing the
                    // whole feed, unless there is nothing to keep.
                    if (count($this->items) === 0) {
                        throw $e;
                    }
                    break;
                }
            }
        } finally {
            $this->cleanUp();
        }
    }
}

View file

@ -0,0 +1,73 @@
<?php
use Facebook\WebDriver\WebDriverBy;
use Facebook\WebDriver\WebDriverExpectedCondition;
/**
 * Bridge for the Scalable Capital blog (German site).
 *
 * The article list is loaded dynamically, so the page is rendered in a
 * remote browser (see WebDriverAbstract) and only the first page of
 * articles is turned into feed items.
 */
class ScalableCapitalBlogBridge extends WebDriverAbstract
{
    const NAME = 'Scalable Capital Blog';
    const URI = 'https://de.scalable.capital/blog';
    const DESCRIPTION = 'Alle Artikel';
    const MAINTAINER = 'hleskien';

    /**
     * Extends the inherited Chrome options with a German accept language.
     *
     * @return Facebook\WebDriver\Chrome\ChromeOptions
     */
    protected function getBrowserOptions()
    {
        $options = parent::getBrowserOptions();
        $options->addArguments(['--accept-lang=de']);

        return $options;
    }

    /**
     * Waits for the article list, then converts every article of the first
     * page into a feed item.
     *
     * @throws Facebook\WebDriver\Exception\NoSuchElementException
     * @throws Facebook\WebDriver\Exception\TimeoutException
     */
    public function collectData()
    {
        parent::collectData();

        try {
            // The list counts as loaded once its 15th entry is visible.
            $lastArticle = WebDriverBy::xpath('//div[contains(@class, "articles")]//div[@class="items"]//div[contains(@class, "item")][15]');
            $this->getDriver()->wait()->until(WebDriverExpectedCondition::visibilityOfElementLocated($lastArticle));

            $iconLink = $this->getDriver()->findElement(WebDriverBy::xpath('//link[@rel="shortcut icon"]'));
            $this->setIcon($iconLink->getAttribute('href'));

            $allArticles = WebDriverBy::xpath('//div[contains(@class, "articles")]//div[@class="items"]//div[contains(@class, "item")]');
            foreach ($this->getDriver()->findElements($allArticles) as $article) {
                $entry = new FeedItem();

                // Image and link targets are prefixed with the site origin.
                $imageSrc = $article->findElement(WebDriverBy::tagName('img'))->getAttribute('src');
                $entry->setEnclosures(['https://de.scalable.capital' . $imageSrc]);

                $link = $article->findElement(WebDriverBy::tagName('a'));
                $entry->setTitle($link->getText());
                $entry->setURI('https://de.scalable.capital' . $link->getAttribute('href'));

                $summary = $article->findElement(WebDriverBy::xpath('.//div[@class="summary"]'));
                $entry->setContent($summary->getText());

                $published = $article->findElement(WebDriverBy::xpath('.//div[@class="published-date"]'));
                $entry->setTimestamp($this->formatItemTimestamp($published->getText()));

                $author = $article->findElement(WebDriverBy::xpath('.//div[@class="author"]'));
                $entry->setAuthor($author->getText());

                $this->items[] = $entry;
            }
        } finally {
            $this->cleanUp();
        }
    }

    /**
     * Parses a date string with the German ('de') locale, using the LONG
     * date style and no time part, and returns the resulting timestamp.
     *
     * NOTE(review): this comment previously described the input as
     * dd.mm.yyyy; the LONG style normally expects a spelled-out month -
     * confirm against the dates shown on the site.
     *
     * @param $value string
     * @return int|float|false The result of IntlDateFormatter::parse(), which is false if parsing fails.
     */
    protected function formatItemTimestamp($value)
    {
        return (new IntlDateFormatter('de', IntlDateFormatter::LONG, IntlDateFormatter::NONE))->parse($value);
    }
}

View file

@ -99,6 +99,16 @@ name = "Hidden proxy name"
; false = disabled (default)
by_bridge = false
[webdriver]
; Sets the url of the webdriver or selenium server
selenium_server_url = "http://localhost:4444"
; Sets whether the browser should run in headless mode (no visible ui)
; true = enabled
; false = disabled (default)
headless = false
[authentication]
; HTTP basic authentication

View file

@ -0,0 +1,83 @@
`WebDriverAbstract` extends [`BridgeAbstract`](./02_BridgeAbstract.md) and adds functionality for generating feeds
from active websites that use XMLHttpRequest (XHR) to load content and / or JavaScript to
modify content.
It highly depends on the php-webdriver library which offers Selenium WebDriver bindings for PHP.
- https://github.com/php-webdriver/php-webdriver (Project Repository)
- https://php-webdriver.github.io/php-webdriver/latest/ (API)
Please note that this class is intended as a solution for websites _that cannot be covered
by the other classes_. The WebDriver starts a browser and is therefore very resource-intensive.
# Configuration
You need a running WebDriver to use bridges that depend on `WebDriverAbstract`.
The easiest way is to start the Selenium server from the project of the same name:
```
docker run -d -p 4444:4444 --shm-size="2g" docker.io/selenium/standalone-chrome:latest
```
- https://github.com/SeleniumHQ/docker-selenium
With these parameters only one browser window can be started at a time.
On a multi-user site, Selenium Grid should be used
and the number of sessions should be adjusted to the number of processor cores.
Finally, the `config.ini.php` file must be adjusted so that the WebDriver
can find the Selenium server:
```
[webdriver]
selenium_server_url = "http://localhost:4444"
```
# Development
While you are programming a new bridge, it is easier to start a local WebDriver because then you can see what is happening and where the errors are. I've also had good experience recording the process with a screen video to find any timing problems.
```
chromedriver --port=4444
```
- https://chromedriver.chromium.org/
If you run rss-bridge inside a container, ChromeDriver is only reachable from it
if you start ChromeDriver with the `--allowed-ips` option, so that it binds to all network interfaces.
```
chromedriver --port=4444 --allowed-ips=192.168.1.42
```
The **most important rule** is that after an event such as loading the web page
or pressing a button, you often have to explicitly wait for the desired elements to appear.
A simple example is the bridge `ScalableCapitalBlogBridge.php`.
A more complex and relatively complete example is the bridge `GULPProjekteBridge.php`.
# Template
Use this template to create your own bridge.
```PHP
<?php
// Skeleton for a WebDriver based bridge: adjust the constants and replace the TODO.
class MyBridge extends WebDriverAbstract
{
const NAME = 'My Bridge';
// The page that is opened in the remote browser before scraping starts.
const URI = 'https://www.example.org';
const DESCRIPTION = 'Further description';
const MAINTAINER = 'your name';
public function collectData()
{
// Creates the remote WebDriver session and opens URI in the browser window.
parent::collectData();
try {
// TODO: wait explicitly for the desired elements, then fill $this->items
} finally {
// Always shut down the remote browser, even if scraping throws.
$this->cleanUp();
}
}
}
```

View file

@ -8,6 +8,7 @@ Base class | Description
-----------|------------
[`BridgeAbstract`](./02_BridgeAbstract.md) | This class is intended for standard _Bridges_ that need to filter HTML pages for content.
[`FeedExpander`](./03_FeedExpander.md) | Expand/modify existing feed urls
[`XPathAbstract`](./04_XPathAbstract.md) | This class is meant as an alternative base class for bridge implementations. It offers preliminary functionality for generating feeds based on _XPath expressions_.
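[`WebDriverAbstract`](./04_WebDriverAbstract) | This class is meant as a solution for active websites that use XMLHttpRequest (XHR) to load content and/or JavaScript to modify content. It requires a running WebDriver (Selenium) server.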
[`XPathAbstract`](./05_XPathAbstract) | This class is meant as an alternative base class for bridge implementations. It offers preliminary functionality for generating feeds based on _XPath expressions_.
For more information about how to create a new _Bridge_, read [How to create a new Bridge?](./01_How_to_create_a_new_bridge.md)

141
lib/WebDriverAbstract.php Normal file
View file

@ -0,0 +1,141 @@
<?php
use Facebook\WebDriver\Chrome\ChromeOptions;
use Facebook\WebDriver\Remote\DesiredCapabilities;
use Facebook\WebDriver\Remote\RemoteWebDriver;
use Facebook\WebDriver\WebDriverCapabilities;
/**
 * An alternative abstract class for bridges depending on webdriver
 *
 * This class is meant as a solution for active websites that use
 * XMLHttpRequest (XHR) to load content and/or use JavaScript to
 * change content. This class depends on a working webdriver setup.
 */
abstract class WebDriverAbstract extends BridgeAbstract
{
    /**
     * Holds the remote webdriver object, including configuration and
     * connection.
     *
     * Uninitialized until prepareWebDriver() has run and again after
     * cleanUp() has shut the session down.
     *
     * @var RemoteWebDriver
     */
    protected RemoteWebDriver $driver;

    /**
     * Holds the uri of the feed's icon.
     *
     * @var string | null
     */
    private $feedIcon;

    /**
     * Returns the webdriver object.
     *
     * @return RemoteWebDriver
     */
    protected function getDriver(): RemoteWebDriver
    {
        return $this->driver;
    }

    /**
     * Returns the uri of the feed's icon.
     *
     * Falls back to the parent's icon as long as no (non-empty) icon has
     * been set with setIcon().
     *
     * @return string
     */
    public function getIcon()
    {
        return $this->feedIcon ?: parent::getIcon();
    }

    /**
     * Sets the uri of the feed's icon.
     *
     * @param $iconurl string
     */
    protected function setIcon($iconurl)
    {
        $this->feedIcon = $iconurl;
    }

    /**
     * Returns the ChromeOptions object.
     *
     * If the configuration parameter 'headless' is set to true, the
     * argument '--headless' is added. Override this to change or add
     * more options.
     *
     * @return ChromeOptions
     */
    protected function getBrowserOptions()
    {
        $chromeOptions = new ChromeOptions();
        if (Configuration::getConfig('webdriver', 'headless')) {
            $chromeOptions->addArguments(['--headless']); // --window-size=1024,1024
        }
        return $chromeOptions;
    }

    /**
     * Returns the DesiredCapabilities object for the Chrome browser.
     *
     * The Chrome options are added. Override this to change or add
     * more capabilities.
     *
     * @return WebDriverCapabilities
     */
    protected function getDesiredCapabilities(): WebDriverCapabilities
    {
        $desiredCapabilities = DesiredCapabilities::chrome();
        $desiredCapabilities->setCapability(ChromeOptions::CAPABILITY, $this->getBrowserOptions());
        return $desiredCapabilities;
    }

    /**
     * Constructs the remote webdriver with the url of the remote (Selenium)
     * webdriver server and the desired capabilities.
     *
     * This should be called in collectData() first.
     */
    protected function prepareWebDriver()
    {
        $server = Configuration::getConfig('webdriver', 'selenium_server_url');
        $this->driver = RemoteWebDriver::create($server, $this->getDesiredCapabilities());
    }

    /**
     * Maximizes the remote browser window (often important for reactive sites
     * which change their appearance depending on the window size) and opens
     * the uri returned by getURI().
     */
    protected function prepareWindow()
    {
        $this->getDriver()->manage()->window()->maximize();
        $this->getDriver()->get($this->getURI());
    }

    /**
     * Closes the remote browser window and shuts down the remote webdriver
     * connection.
     *
     * This must be called at the end of scraping, for example within a
     * 'finally' block. Calling it without an active session (never created
     * or already cleaned up) is a no-op, so it is safe to call repeatedly.
     */
    protected function cleanUp()
    {
        if (!isset($this->driver)) {
            return;
        }

        try {
            $this->getDriver()->quit();
        } finally {
            // Forget the session even if quitting failed, so that a second
            // cleanUp() does not talk to a dead session.
            unset($this->driver);
        }
    }

    /**
     * Do your web scraping here and fill the $items array.
     *
     * Override this but call parent::collectData() first.
     * Don't forget to call cleanUp() at the end.
     *
     * If opening the page fails after the session has been created, the
     * session is shut down here before the error is rethrown, because
     * subclasses typically call this method before entering their own
     * try/finally block.
     */
    public function collectData()
    {
        $this->prepareWebDriver();

        try {
            $this->prepareWindow();
        } catch (\Throwable $e) {
            try {
                $this->cleanUp();
            } catch (\Throwable $cleanUpError) {
                // The original failure is the one worth reporting.
            }
            throw $e;
        }
    }
}