mirror of
https://github.com/RSS-Bridge/rss-bridge.git
synced 2025-02-16 07:09:54 +03:00
Adopt WebDriverAbstract as a solution for active (JavaScript) websites (#3971)
* first working version --------- Co-authored-by: Dag <me@dvikan.no>
This commit is contained in:
parent
ff7840d60f
commit
8e8028b786
7 changed files with 473 additions and 1 deletions
164
bridges/GULPProjekteBridge.php
Normal file
164
bridges/GULPProjekteBridge.php
Normal file
|
@ -0,0 +1,164 @@
|
|||
<?php
|
||||
|
||||
use Facebook\WebDriver\Exception\NoSuchElementException;
|
||||
use Facebook\WebDriver\Remote\RemoteWebElement;
|
||||
use Facebook\WebDriver\WebDriverBy;
|
||||
use Facebook\WebDriver\WebDriverExpectedCondition;
|
||||
|
||||
class GULPProjekteBridge extends WebDriverAbstract
{
    const NAME = 'GULP Projekte';
    const URI = 'https://www.gulp.de/gulp2/g/projekte';
    const DESCRIPTION = 'Projektsuche';
    const MAINTAINER = 'hleskien';

    /**
     * Pagination stops as soon as at least this many items were collected.
     */
    const MAXITEMS = 60;

    /**
     * Adds accept language german to the Chrome Options.
     *
     * @return Facebook\WebDriver\Chrome\ChromeOptions
     */
    protected function getBrowserOptions()
    {
        $chromeOptions = parent::getBrowserOptions();
        $chromeOptions->addArguments(['--accept-lang=de']);
        return $chromeOptions;
    }

    /**
     * Waits for the "reject all" button of the cookie banner, clicks it and
     * waits until the button is no longer visible.
     *
     * @throws Facebook\WebDriver\Exception\NoSuchElementException
     * @throws Facebook\WebDriver\Exception\TimeoutException
     */
    protected function clickAwayCookieBanner()
    {
        $rejectButton = WebDriverBy::id('onetrust-reject-all-handler');

        $this->getDriver()->wait()->until(WebDriverExpectedCondition::visibilityOfElementLocated($rejectButton));
        $this->getDriver()->findElement($rejectButton)->click();
        $this->getDriver()->wait()->until(WebDriverExpectedCondition::invisibilityOfElementLocated($rejectButton));
    }

    /**
     * Clicks the "next page" link of the paginator and waits until that link
     * no longer carries the href it had before the click.
     *
     * @throws Facebook\WebDriver\Exception\NoSuchElementException
     * @throws Facebook\WebDriver\Exception\TimeoutException
     */
    protected function clickNextPage()
    {
        $nextPage = $this->getDriver()->findElement(WebDriverBy::xpath('//app-linkable-paginator//li[@id="next-page"]/a'));
        // Remember the current target so the change of the link can be detected below.
        $href = $nextPage->getAttribute('href');
        $nextPage->click();
        $this->getDriver()->wait()->until(WebDriverExpectedCondition::not(
            WebDriverExpectedCondition::presenceOfElementLocated(
                WebDriverBy::xpath('//app-linkable-paginator//li[@id="next-page"]/a[@href="' . $href . '"]')
            )
        ));
    }

    /**
     * Returns the uri of the 'Projektanbieter' logo or false if there is
     * no logo present in the item.
     *
     * @return string | false
     */
    protected function getLogo(RemoteWebElement $item)
    {
        try {
            $logo = $item->findElement(WebDriverBy::tagName('img'))->getAttribute('src');
        } catch (NoSuchElementException $e) {
            return false;
        }

        if (!is_string($logo) || $logo === '') {
            // An <img> without a usable src attribute is treated like a missing logo.
            return false;
        }

        if (str_starts_with($logo, 'http')) {
            // different domain
            return $logo;
        }

        // relative path: prepend the bridge URI up to and including its last slash
        return substr(self::URI, 0, strrpos(self::URI, '/') + 1) . $logo;
    }

    /**
     * Converts a string like "vor einigen Minuten" into a reasonable timestamp.
     * Long and complicated, but we don't want to be more specific than
     * the information we have available.
     *
     * @param string $timeAgo Relative time; the second word is the quantity.
     * @return int Unix timestamp
     * @throws UnexpectedValueException If the text contains none of the known units.
     * @throws Exception If the DateInterval can't be parsed.
     */
    protected function getTimestamp(string $timeAgo): int
    {
        $dateTime = new DateTime();
        $hours = $dateTime->format('H');
        $minutes = $dateTime->format('i');
        $seconds = $dateTime->format('s');

        // The quantity is the second word; fall back to an empty string if there is none.
        $quantityStr = explode(' ', $timeAgo)[1] ?? '';
        // convert possible word into a number
        if (in_array($quantityStr, ['einem', 'einer', 'einigen'], true)) {
            $quantity = 1;
        } else {
            $quantity = intval($quantityStr);
        }

        // subtract time ago + inferior units for lower precision
        if (str_contains($timeAgo, 'Sekunde')) {
            $interval = new DateInterval('PT' . $quantity . 'S');
        } elseif (str_contains($timeAgo, 'Minute')) {
            $interval = new DateInterval('PT' . $quantity . 'M' . $seconds . 'S');
        } elseif (str_contains($timeAgo, 'Stunde')) {
            $interval = new DateInterval('PT' . $quantity . 'H' . $minutes . 'M' . $seconds . 'S');
        } elseif (str_contains($timeAgo, 'Tag')) {
            $interval = new DateInterval('P' . $quantity . 'DT' . $hours . 'H' . $minutes . 'M' . $seconds . 'S');
        } else {
            throw new UnexpectedValueException($timeAgo);
        }

        return $dateTime->sub($interval)->getTimestamp();
    }

    /**
     * Builds one feed item from a single 'app-project-view' element.
     *
     * @throws Facebook\WebDriver\Exception\NoSuchElementException
     */
    private function createFeedItem(RemoteWebElement $item): FeedItem
    {
        $feedItem = new FeedItem();

        $heading = $item->findElement(WebDriverBy::xpath('.//app-heading-tag/h1/a'));
        $feedItem->setTitle($heading->getText());
        $feedItem->setURI('https://www.gulp.de' . $heading->getAttribute('href'));

        $logo = $this->getLogo($item);
        if ($logo) {
            $feedItem->setEnclosures([$logo]);
        }

        $info = $item->findElement(WebDriverBy::tagName('app-icon-info-list'));
        if (str_contains($info->getText(), 'Projektanbieter:')) {
            $feedItem->setAuthor($info->findElement(WebDriverBy::xpath('.//li/span[2]/span'))->getText());
        } else {
            // mostly "Direkt vom Auftraggeber" or "GULP Agentur"
            $feedItem->setAuthor($item->findElement(WebDriverBy::tagName('b'))->getText());
        }

        $feedItem->setContent($item->findElement(WebDriverBy::xpath('.//p[@class="description"]'))->getText());
        $timeAgo = $item->findElement(WebDriverBy::xpath('.//small[contains(@class, "time-ago")]'))->getText();
        $feedItem->setTimestamp($this->getTimestamp($timeAgo));

        return $feedItem;
    }

    /**
     * The main loop which clicks through search result pages and puts
     * the content into the $items array.
     *
     * Paging ends when MAXITEMS is reached or when no further page can be
     * opened; in the latter case the items collected so far are kept.
     *
     * @throws Facebook\WebDriver\Exception\NoSuchElementException
     * @throws Facebook\WebDriver\Exception\TimeoutException
     */
    public function collectData()
    {
        parent::collectData();

        try {
            $this->clickAwayCookieBanner();
            $this->setIcon($this->getDriver()->findElement(WebDriverBy::xpath('//link[@rel="shortcut icon"]'))->getAttribute('href'));

            while (true) {
                $items = $this->getDriver()->findElements(WebDriverBy::tagName('app-project-view'));
                foreach ($items as $item) {
                    $this->items[] = $this->createFeedItem($item);
                }

                if (count($this->items) >= self::MAXITEMS) {
                    break;
                }

                try {
                    $this->clickNextPage();
                } catch (NoSuchElementException | \Facebook\WebDriver\Exception\TimeoutException $e) {
                    if (count($this->items) === 0) {
                        // Nothing was scraped at all: report the failure instead of an empty feed.
                        throw $e;
                    }
                    // No further page could be opened: keep what has been collected so far.
                    break;
                }
            }
        } finally {
            $this->cleanUp();
        }
    }
}
|
73
bridges/ScalableCapitalBlogBridge.php
Normal file
73
bridges/ScalableCapitalBlogBridge.php
Normal file
|
@ -0,0 +1,73 @@
|
|||
<?php
|
||||
|
||||
use Facebook\WebDriver\WebDriverBy;
|
||||
use Facebook\WebDriver\WebDriverExpectedCondition;
|
||||
|
||||
class ScalableCapitalBlogBridge extends WebDriverAbstract
{
    const NAME = 'Scalable Capital Blog';
    const URI = 'https://de.scalable.capital/blog';
    const DESCRIPTION = 'Alle Artikel';
    const MAINTAINER = 'hleskien';

    /**
     * Adds accept language german to the Chrome Options.
     *
     * @return Facebook\WebDriver\Chrome\ChromeOptions
     */
    protected function getBrowserOptions()
    {
        $chromeOptions = parent::getBrowserOptions();
        $chromeOptions->addArguments(['--accept-lang=de']);
        return $chromeOptions;
    }

    /**
     * Puts the content of the first page into the $items array.
     *
     * @throws Facebook\WebDriver\Exception\NoSuchElementException
     * @throws Facebook\WebDriver\Exception\TimeoutException
     */
    public function collectData()
    {
        parent::collectData();

        try {
            // One XPath for the article teasers, used both for waiting and for collecting.
            $itemXPath = '//div[contains(@class, "articles")]//div[@class="items"]//div[contains(@class, "item")]';

            // wait until last item is loaded
            $this->getDriver()->wait()->until(WebDriverExpectedCondition::visibilityOfElementLocated(
                WebDriverBy::xpath($itemXPath . '[15]')
            ));
            $this->setIcon($this->getDriver()->findElement(WebDriverBy::xpath('//link[@rel="shortcut icon"]'))->getAttribute('href'));

            $items = $this->getDriver()->findElements(WebDriverBy::xpath($itemXPath));
            foreach ($items as $item) {
                $feedItem = new FeedItem();

                $feedItem->setEnclosures(['https://de.scalable.capital' . $item->findElement(WebDriverBy::tagName('img'))->getAttribute('src')]);
                $heading = $item->findElement(WebDriverBy::tagName('a'));
                $feedItem->setTitle($heading->getText());
                $feedItem->setURI('https://de.scalable.capital' . $heading->getAttribute('href'));
                $feedItem->setContent($item->findElement(WebDriverBy::xpath('.//div[@class="summary"]'))->getText());
                $date = $item->findElement(WebDriverBy::xpath('.//div[@class="published-date"]'))->getText();
                $feedItem->setTimestamp($this->formatItemTimestamp($date));
                $feedItem->setAuthor($item->findElement(WebDriverBy::xpath('.//div[@class="author"]'))->getText());

                $this->items[] = $feedItem;
            }
        } finally {
            $this->cleanUp();
        }
    }

    /**
     * Converts a german date into a timestamp.
     *
     * The long date style of the 'de' locale is tried first, then the
     * medium (numeric, dd.mm.yyyy) style as a fallback.
     *
     * @param $value string
     * @return int|float|false The parsed timestamp, or false if neither style matches.
     */
    protected function formatItemTimestamp($value)
    {
        foreach ([IntlDateFormatter::LONG, IntlDateFormatter::MEDIUM] as $dateStyle) {
            $formatter = new IntlDateFormatter('de', $dateStyle, IntlDateFormatter::NONE);
            $timestamp = $formatter->parse($value);
            if ($timestamp !== false) {
                return $timestamp;
            }
        }
        return false;
    }
}
|
|
@ -99,6 +99,16 @@ name = "Hidden proxy name"
|
|||
; false = disabled (default)
|
||||
by_bridge = false
|
||||
|
||||
[webdriver]
|
||||
|
||||
; Sets the url of the webdriver or selenium server
|
||||
selenium_server_url = "http://localhost:4444"
|
||||
|
||||
; Sets whether the browser should run in headless mode (no visible ui)
|
||||
; true = enabled
|
||||
; false = disabled (default)
|
||||
headless = false
|
||||
|
||||
[authentication]
|
||||
|
||||
; HTTP basic authentication
|
||||
|
|
83
docs/05_Bridge_API/04_WebDriverAbstract.md
Normal file
83
docs/05_Bridge_API/04_WebDriverAbstract.md
Normal file
|
@ -0,0 +1,83 @@
|
|||
`WebDriverAbstract` extends [`BridgeAbstract`](./02_BridgeAbstract.md) and adds functionality for generating feeds
|
||||
from active websites that use XMLHttpRequest (XHR) to load content and / or JavaScript to
|
||||
modify content.
|
||||
It relies heavily on the php-webdriver library, which provides Selenium WebDriver bindings for PHP.
|
||||
|
||||
- https://github.com/php-webdriver/php-webdriver (Project Repository)
|
||||
- https://php-webdriver.github.io/php-webdriver/latest/ (API)
|
||||
|
||||
Please note that this class is intended as a solution for websites _that cannot be covered
|
||||
by the other classes_. The WebDriver starts a browser and is therefore very resource-intensive.
|
||||
|
||||
# Configuration
|
||||
|
||||
You need a running WebDriver to use bridges that depend on `WebDriverAbstract`.
|
||||
The easiest way is to start the Selenium server from the project of the same name:
|
||||
```
|
||||
docker run -d -p 4444:4444 --shm-size="2g" docker.io/selenium/standalone-chrome:latest
|
||||
```
|
||||
|
||||
- https://github.com/SeleniumHQ/docker-selenium
|
||||
|
||||
With these parameters only one browser window can be started at a time.
|
||||
On a multi-user site, Selenium Grid should be used
|
||||
and the number of sessions should be adjusted to the number of processor cores.
|
||||
|
||||
Finally, the `config.ini.php` file must be adjusted so that the WebDriver
|
||||
can find the Selenium server:
|
||||
```
|
||||
[webdriver]
|
||||
|
||||
selenium_server_url = "http://localhost:4444"
|
||||
```
|
||||
|
||||
# Development
|
||||
|
||||
While you are developing a new bridge, it is easier to start a local WebDriver, because then you can see what is happening and where the errors occur. Recording the browser session as a screen video has also proven useful for tracking down timing problems.
|
||||
|
||||
```
|
||||
chromedriver --port=4444
|
||||
```
|
||||
|
||||
- https://chromedriver.chromium.org/
|
||||
|
||||
If you start rss-bridge from a container, then Chrome driver is only accessible
|
||||
if you call it with the `--allowed-ips` option so that it binds to all network interfaces.
|
||||
|
||||
```
|
||||
chromedriver --port=4444 --allowed-ips=192.168.1.42
|
||||
```
|
||||
|
||||
The **most important rule** is that after an event such as loading the web page
|
||||
or pressing a button, you often have to explicitly wait for the desired elements to appear.
|
||||
|
||||
A simple example is the bridge `ScalableCapitalBlogBridge.php`.
|
||||
A more complex and relatively complete example is the bridge `GULPProjekteBridge.php`.
|
||||
|
||||
# Template
|
||||
|
||||
Use this template to create your own bridge.
|
||||
|
||||
```PHP
|
||||
<?php
|
||||
|
||||
class MyBridge extends WebDriverAbstract
|
||||
{
|
||||
const NAME = 'My Bridge';
|
||||
const URI = 'https://www.example.org';
|
||||
const DESCRIPTION = 'Further description';
|
||||
const MAINTAINER = 'your name';
|
||||
|
||||
public function collectData()
|
||||
{
|
||||
parent::collectData();
|
||||
|
||||
try {
|
||||
// TODO
|
||||
} finally {
|
||||
$this->cleanUp();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
```
|
|
@ -8,6 +8,7 @@ Base class | Description
|
|||
-----------|------------
|
||||
[`BridgeAbstract`](./02_BridgeAbstract.md) | This class is intended for standard _Bridges_ that need to filter HTML pages for content.
|
||||
[`FeedExpander`](./03_FeedExpander.md) | Expand/modify existing feed urls
|
||||
[`XPathAbstract`](./04_XPathAbstract.md) | This class is meant as an alternative base class for bridge implementations. It offers preliminary functionality for generating feeds based on _XPath expressions_.
|
||||
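[`WebDriverAbstract`](./04_WebDriverAbstract.md) | This class is intended for active websites that use XMLHttpRequest (XHR) to load content and/or JavaScript to modify content. It requires a running WebDriver (e.g. Selenium) and is therefore very resource-intensive.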
|
||||
[`XPathAbstract`](./05_XPathAbstract.md) | This class is meant as an alternative base class for bridge implementations. It offers preliminary functionality for generating feeds based on _XPath expressions_.
|
||||
|
||||
For more information about how to create a new _Bridge_, read [How to create a new Bridge?](./01_How_to_create_a_new_bridge.md)
|
141
lib/WebDriverAbstract.php
Normal file
141
lib/WebDriverAbstract.php
Normal file
|
@ -0,0 +1,141 @@
|
|||
<?php
|
||||
|
||||
use Facebook\WebDriver\Chrome\ChromeOptions;
|
||||
use Facebook\WebDriver\Remote\DesiredCapabilities;
|
||||
use Facebook\WebDriver\Remote\RemoteWebDriver;
|
||||
use Facebook\WebDriver\WebDriverCapabilities;
|
||||
|
||||
/**
 * An alternative abstract class for bridges depending on webdriver
 *
 * This class is meant as a solution for active websites that use
 * XMLHttpRequest (XHR) to load content and/or use JavaScript to
 * change content. This class depends on a working webdriver setup.
 */
abstract class WebDriverAbstract extends BridgeAbstract
{
    /**
     * Holds the remote webdriver object, including configuration and
     * connection.
     *
     * Uninitialized until prepareWebDriver() has run and again after
     * cleanUp() has been called.
     *
     * @var RemoteWebDriver
     */
    protected RemoteWebDriver $driver;

    /**
     * Holds the uri of the feed's icon.
     *
     * @var string | null
     */
    private $feedIcon;

    /**
     * Returns the webdriver object.
     *
     * @return RemoteWebDriver
     */
    protected function getDriver(): RemoteWebDriver
    {
        return $this->driver;
    }

    /**
     * Returns the uri of the feed's icon, falling back to the parent's icon
     * if none has been set via setIcon().
     *
     * @return string
     */
    public function getIcon()
    {
        return $this->feedIcon ?: parent::getIcon();
    }

    /**
     * Sets the uri of the feed's icon.
     *
     * @param $iconurl string
     */
    protected function setIcon($iconurl)
    {
        $this->feedIcon = $iconurl;
    }

    /**
     * Returns the ChromeOptions object.
     *
     * If the configuration parameter 'headless' is set to true, the
     * argument '--headless' is added. Override this to change or add
     * more options.
     *
     * @return ChromeOptions
     */
    protected function getBrowserOptions()
    {
        $chromeOptions = new ChromeOptions();
        if (Configuration::getConfig('webdriver', 'headless')) {
            $chromeOptions->addArguments(['--headless']); // --window-size=1024,1024
        }
        return $chromeOptions;
    }

    /**
     * Returns the DesiredCapabilities object for the Chrome browser.
     *
     * The Chrome options are added. Override this to change or add
     * more capabilities.
     *
     * @return WebDriverCapabilities
     */
    protected function getDesiredCapabilities(): WebDriverCapabilities
    {
        $desiredCapabilities = DesiredCapabilities::chrome();
        $desiredCapabilities->setCapability(ChromeOptions::CAPABILITY, $this->getBrowserOptions());
        return $desiredCapabilities;
    }

    /**
     * Constructs the remote webdriver with the url of the remote (Selenium)
     * webdriver server and the desired capabilities.
     *
     * This should be called in collectData() first.
     */
    protected function prepareWebDriver()
    {
        $server = Configuration::getConfig('webdriver', 'selenium_server_url');
        $this->driver = RemoteWebDriver::create($server, $this->getDesiredCapabilities());
    }

    /**
     * Maximizes the remote browser window (often important for reactive sites
     * which change their appearance depending on the window size) and opens
     * the uri returned by getURI().
     */
    protected function prepareWindow()
    {
        $this->getDriver()->manage()->window()->maximize();
        $this->getDriver()->get($this->getURI());
    }

    /**
     * Closes the remote browser window and shuts down the remote webdriver
     * connection.
     *
     * This must be called at the end of scraping, for example within a
     * 'finally' block. It does nothing if no webdriver has been created
     * (or if it has already been shut down), so calling it more than once
     * is safe.
     */
    protected function cleanUp()
    {
        if (!isset($this->driver)) {
            // Nothing to shut down; reading the typed property here would raise an Error.
            return;
        }

        try {
            $this->getDriver()->quit();
        } finally {
            // Back to the uninitialized state so that a second call is a no-op.
            unset($this->driver);
        }
    }

    /**
     * Do your web scraping here and fill the $items array.
     *
     * Override this but call parent::collectData() first.
     * Don't forget to call cleanUp() at the end.
     *
     * If opening the page fails, the browser session that has just been
     * created is shut down before the error is passed on.
     */
    public function collectData()
    {
        $this->prepareWebDriver();

        try {
            $this->prepareWindow();
        } catch (\Throwable $e) {
            try {
                $this->cleanUp();
            } catch (\Throwable $cleanUpError) {
                // The failure to open the page is the error worth reporting.
            }
            throw $e;
        }
    }
}
|
Loading…
Add table
Reference in a new issue