diff --git a/bridges/GULPProjekteBridge.php b/bridges/GULPProjekteBridge.php
new file mode 100644
index 00000000..e0bb8cbe
--- /dev/null
+++ b/bridges/GULPProjekteBridge.php
@@ -0,0 +1,164 @@
+addArguments(['--accept-lang=de']);
+        return $chromeOptions;
+    }
+
+    /**
+     * Clicks the cookie banner's "reject all" button (element id
+     * 'onetrust-reject-all-handler') and waits until that button is no
+     * longer visible.
+     *
+     * @throws Facebook\WebDriver\Exception\NoSuchElementException
+     * @throws Facebook\WebDriver\Exception\TimeoutException
+     */
+    protected function clickAwayCookieBanner()
+    {
+        // one locator, reused for waiting, clicking and waiting again
+        $rejectButton = WebDriverBy::id('onetrust-reject-all-handler');
+        $this->getDriver()->wait()->until(WebDriverExpectedCondition::visibilityOfElementLocated($rejectButton));
+        $this->getDriver()->findElement($rejectButton)->click();
+        $this->getDriver()->wait()->until(WebDriverExpectedCondition::invisibilityOfElementLocated($rejectButton));
+    }
+
+    /**
+     * Clicks the paginator's next-page link and waits until a link with the
+     * clicked href is no longer present, i.e. the paginator was re-rendered.
+     *
+     * @throws Facebook\WebDriver\Exception\NoSuchElementException If there is no next-page link.
+     * @throws Facebook\WebDriver\Exception\TimeoutException
+     */
+    protected function clickNextPage()
+    {
+        $nextPage = $this->getDriver()->findElement(WebDriverBy::xpath('//app-linkable-paginator//li[@id="next-page"]/a'));
+        // remember the target before clicking; it identifies the "old" link afterwards
+        $href = $nextPage->getAttribute('href');
+        $nextPage->click();
+        $this->getDriver()->wait()->until(WebDriverExpectedCondition::not(
+            WebDriverExpectedCondition::presenceOfElementLocated(
+                WebDriverBy::xpath('//app-linkable-paginator//li[@id="next-page"]/a[@href="' . $href . '"]')
+            )
+        ));
+    }
+
+    /**
+     * Returns the uri of the 'Projektanbieter' logo or false if there is
+     * no logo present in the item.
+     *
+     * A src starting with 'http' is returned unchanged; any other src is
+     * appended to self::URI cut off after its last '/'.
+     *
+     * @param RemoteWebElement $item The search result element to inspect.
+     * @return string|false
+     */
+    protected function getLogo(RemoteWebElement $item)
+    {
+        try {
+            $logo = $item->findElement(WebDriverBy::tagName('img'))->getAttribute('src');
+            if (!is_string($logo) || $logo === '') {
+                // an <img> without a usable src is treated like a missing logo
+                return false;
+            }
+            if (str_starts_with($logo, 'http')) {
+                // different domain
+                return $logo;
+            } else {
+                // relative path: keep everything up to and including the last '/'.
+                // Cutting by offset (instead of by negative length) also works
+                // when self::URI ends with '/', where a length of -0 would
+                // yield an empty base.
+                return substr(self::URI, 0, strrpos(self::URI, '/') + 1) . 
$logo;
+            }
+        } catch (NoSuchElementException $e) {
+            return false;
+        }
+    }
+
+    /**
+     * Converts a relative German time phrase such as "vor einigen Minuten"
+     * into a Unix timestamp.
+     *
+     * The second word of the phrase is the quantity; "einem", "einer" and
+     * "einigen" count as 1, anything else is converted with intval().
+     * Besides the quantity, the current values of all smaller units are
+     * subtracted as well (e.g. the current seconds for a phrase in minutes),
+     * so the result is not more precise than the phrase itself.
+     *
+     * @param string $timeAgo Phrase containing one of the units Sekunde, Minute, Stunde or Tag.
+     * @return int Unix timestamp.
+     * @throws UnexpectedValueException If the phrase contains none of the supported units.
+     * @throws Exception If the DateInterval can't be parsed.
+     */
+    protected function getTimestamp(string $timeAgo): int
+    {
+        $now = new DateTime();
+        [$hours, $minutes, $seconds] = explode(' ', $now->format('H i s'));
+        // A phrase without a second word yields '' here instead of raising an
+        // undefined-index warning; it then ends in the exception below unless
+        // it names a unit.
+        $quantityStr = explode(' ', $timeAgo)[1] ?? '';
+        // convert possible word into a number
+        if (in_array($quantityStr, ['einem', 'einer', 'einigen'], true)) {
+            $quantity = 1;
+        } else {
+            $quantity = intval($quantityStr);
+        }
+        // subtract time ago + inferior units for lower precision
+        if (str_contains($timeAgo, 'Sekunde')) {
+            $spec = 'PT' . $quantity . 'S';
+        } elseif (str_contains($timeAgo, 'Minute')) {
+            $spec = 'PT' . $quantity . 'M' . $seconds . 'S';
+        } elseif (str_contains($timeAgo, 'Stunde')) {
+            $spec = 'PT' . $quantity . 'H' . $minutes . 'M' . $seconds . 'S';
+        } elseif (str_contains($timeAgo, 'Tag')) {
+            $spec = 'P' . $quantity . 'DT' . $hours . 'H' . $minutes . 'M' . $seconds . 'S';
+        } else {
+            throw new UnexpectedValueException($timeAgo);
+        }
+        return $now->sub(new DateInterval($spec))->getTimestamp();
+    }
+
+    /**
+     * The main loop which clicks through search result pages and puts
+     * the content into the $items array. 
+     *
+     * Paging stops as soon as at least self::MAXITEMS items were collected
+     * or the current page offers no next-page link (last result page).
+     *
+     * @throws Facebook\WebDriver\Exception\NoSuchElementException
+     * @throws Facebook\WebDriver\Exception\TimeoutException
+     */
+    public function collectData()
+    {
+        parent::collectData();
+
+        try {
+            $this->clickAwayCookieBanner();
+            $this->setIcon($this->getDriver()->findElement(WebDriverBy::xpath('//link[@rel="shortcut icon"]'))->getAttribute('href'));
+
+            while (true) {
+                $items = $this->getDriver()->findElements(WebDriverBy::tagName('app-project-view'));
+                foreach ($items as $item) {
+                    $feedItem = new FeedItem();
+
+                    $heading = $item->findElement(WebDriverBy::xpath('.//app-heading-tag/h1/a'));
+                    $feedItem->setTitle($heading->getText());
+                    $feedItem->setURI('https://www.gulp.de' . $heading->getAttribute('href'));
+                    $info = $item->findElement(WebDriverBy::tagName('app-icon-info-list'));
+                    if ($logo = $this->getLogo($item)) {
+                        $feedItem->setEnclosures([$logo]);
+                    }
+                    if (str_contains($info->getText(), 'Projektanbieter:')) {
+                        $feedItem->setAuthor($info->findElement(WebDriverBy::xpath('.//li/span[2]/span'))->getText());
+                    } else {
+                        // mostly "Direkt vom Auftraggeber" or "GULP Agentur"
+                        $feedItem->setAuthor($item->findElement(WebDriverBy::tagName('b'))->getText());
+                    }
+                    $feedItem->setContent($item->findElement(WebDriverBy::xpath('.//p[@class="description"]'))->getText());
+                    $timeAgo = $item->findElement(WebDriverBy::xpath('.//small[contains(@class, "time-ago")]'))->getText();
+                    $feedItem->setTimestamp($this->getTimestamp($timeAgo));
+
+                    $this->items[] = $feedItem;
+                }
+
+                if (count($this->items) >= self::MAXITEMS) {
+                    break;
+                }
+                try {
+                    $this->clickNextPage();
+                } catch (\Facebook\WebDriver\Exception\NoSuchElementException $e) {
+                    // No next-page link: this was the last result page. Keep
+                    // the items collected so far instead of failing the feed.
+                    break;
+                }
+            }
+        } finally {
+            $this->cleanUp();
+        }
+    }
+}
diff --git a/bridges/ScalableCapitalBlogBridge.php b/bridges/ScalableCapitalBlogBridge.php
new file mode 100644
index 00000000..6f95efb3
--- /dev/null
+++ b/bridges/ScalableCapitalBlogBridge.php
@@ -0,0 +1,73 @@
+addArguments(['--accept-lang=de']);
+        return $chromeOptions;
+    }
+
+    /**
+     * Puts the content of the first page into the $items array. 
+     *
+     * Waits for the 15th article element to become visible, stores the
+     * site's shortcut icon and converts every article element into a FeedItem.
+     *
+     * @throws Facebook\WebDriver\Exception\NoSuchElementException
+     * @throws Facebook\WebDriver\Exception\TimeoutException
+     */
+    public function collectData()
+    {
+        parent::collectData();
+
+        $baseUrl = 'https://de.scalable.capital';
+        $articleXPath = '//div[contains(@class, "articles")]//div[@class="items"]//div[contains(@class, "item")]';
+
+        try {
+            $driver = $this->getDriver();
+
+            // wait until last item is loaded
+            $lastArticle = WebDriverBy::xpath($articleXPath . '[15]');
+            $driver->wait()->until(WebDriverExpectedCondition::visibilityOfElementLocated($lastArticle));
+
+            $iconLink = $driver->findElement(WebDriverBy::xpath('//link[@rel="shortcut icon"]'));
+            $this->setIcon($iconLink->getAttribute('href'));
+
+            foreach ($driver->findElements(WebDriverBy::xpath($articleXPath)) as $article) {
+                $entry = new FeedItem();
+
+                $image = $article->findElement(WebDriverBy::tagName('img'));
+                $entry->setEnclosures([$baseUrl . $image->getAttribute('src')]);
+
+                $link = $article->findElement(WebDriverBy::tagName('a'));
+                $entry->setTitle($link->getText());
+                $entry->setURI($baseUrl . $link->getAttribute('href'));
+
+                $summary = $article->findElement(WebDriverBy::xpath('.//div[@class="summary"]'));
+                $entry->setContent($summary->getText());
+
+                $published = $article->findElement(WebDriverBy::xpath('.//div[@class="published-date"]'));
+                $entry->setTimestamp($this->formatItemTimestamp($published->getText()));
+
+                $author = $article->findElement(WebDriverBy::xpath('.//div[@class="author"]'));
+                $entry->setAuthor($author->getText());
+
+                $this->items[] = $entry;
+            }
+        } finally {
+            $this->cleanUp();
+        }
+    }
+
+    /**
+     * Converts the given date (dd.mm.yyyy) into a timestamp. 
+     *
+     * The German long form (e.g. "12. Januar 2024") is tried first; if that
+     * fails, the numeric medium form (e.g. "12.01.2024") is tried.
+     *
+     * @param $value string
+     * @return int|false The timestamp, or false if neither format matches.
+     */
+    protected function formatItemTimestamp($value)
+    {
+        // LONG stays first so that every input accepted before is parsed as before
+        foreach ([IntlDateFormatter::LONG, IntlDateFormatter::MEDIUM] as $dateStyle) {
+            $formatter = new IntlDateFormatter('de', $dateStyle, IntlDateFormatter::NONE);
+            $timestamp = $formatter->parse($value);
+            if ($timestamp !== false) {
+                return $timestamp;
+            }
+        }
+        return false;
+    }
+}
\ No newline at end of file
diff --git a/config.default.ini.php b/config.default.ini.php
index 7729afcb..8f7de832 100644
--- a/config.default.ini.php
+++ b/config.default.ini.php
@@ -99,6 +99,16 @@ name = "Hidden proxy name"
 ; false = disabled (default)
 by_bridge = false
 
+[webdriver]
+
+; Sets the url of the webdriver or selenium server
+selenium_server_url = "http://localhost:4444"
+
+; Sets whether the browser should run in headless mode (no visible ui)
+; true = enabled
+; false = disabled (default)
+headless = false
+
 [authentication]
 
 ; HTTP basic authentication
diff --git a/docs/05_Bridge_API/04_WebDriverAbstract.md b/docs/05_Bridge_API/04_WebDriverAbstract.md
new file mode 100644
index 00000000..60b5e99d
--- /dev/null
+++ b/docs/05_Bridge_API/04_WebDriverAbstract.md
@@ -0,0 +1,83 @@
+`WebDriverAbstract` extends [`BridgeAbstract`](./02_BridgeAbstract.md) and adds functionality for generating feeds
+from active websites that use XMLHttpRequest (XHR) to load content and / or JavaScript to
+modify content.
+It highly depends on the php-webdriver library which offers Selenium WebDriver bindings for PHP.
+
+- https://github.com/php-webdriver/php-webdriver (Project Repository)
+- https://php-webdriver.github.io/php-webdriver/latest/ (API)
+
+Please note that this class is intended as a solution for websites _that cannot be covered
+by the other classes_. The WebDriver starts a browser and is therefore very resource-intensive.
+
+# Configuration
+
+You need a running WebDriver to use bridges that depend on `WebDriverAbstract`. 
+The easiest way is to start the Selenium server from the project of the same name:
+```
+docker run -d -p 4444:4444 --shm-size="2g" docker.io/selenium/standalone-chrome:latest
+```
+
+- https://github.com/SeleniumHQ/docker-selenium
+
+With these parameters only one browser window can be started at a time.
+On a multi-user site, Selenium Grid should be used
+and the number of sessions should be adjusted to the number of processor cores.
+
+Finally, the `config.ini.php` file must be adjusted so that `WebDriverAbstract`
+can find the Selenium server:
+```
+[webdriver]
+
+selenium_server_url = "http://localhost:4444"
+```
+
+# Development
+
+While you are developing a new bridge, it is easier to start a local WebDriver (with `headless = false`), because then you can watch what is happening and see where the errors are. Recording the browser session as a screen video also helps to track down timing problems.
+
+```
+chromedriver --port=4444
+```
+
+- https://chromedriver.chromium.org/
+
+If you start rss-bridge from a container, then ChromeDriver is only reachable
+if you start it with the `--allowed-ips` option so that it binds to all network interfaces.
+
+```
+chromedriver --port=4444 --allowed-ips=192.168.1.42
+```
+
+The **most important rule** is that after an event such as loading the web page
+or pressing a button, you often have to explicitly wait for the desired elements to appear.
+
+A simple example is the bridge `ScalableCapitalBlogBridge.php`.
+A more complex and relatively complete example is the bridge `GULPProjekteBridge.php`.
+
+# Template
+
+Use this template to create your own bridge. 
+
+```PHP
+cleanUp();
+        }
+    }
+}
+
+```
\ No newline at end of file
diff --git a/docs/05_Bridge_API/04_XPathAbstract.md b/docs/05_Bridge_API/05_XPathAbstract.md
similarity index 100%
rename from docs/05_Bridge_API/04_XPathAbstract.md
rename to docs/05_Bridge_API/05_XPathAbstract.md
diff --git a/docs/05_Bridge_API/index.md b/docs/05_Bridge_API/index.md
index 06445246..ea6fd315 100644
--- a/docs/05_Bridge_API/index.md
+++ b/docs/05_Bridge_API/index.md
@@ -8,6 +8,7 @@ Base class | Description
 -----------|------------
 [`BridgeAbstract`](./02_BridgeAbstract.md) | This class is intended for standard _Bridges_ that need to filter HTML pages for content.
 [`FeedExpander`](./03_FeedExpander.md) | Expand/modify existing feed urls
-[`XPathAbstract`](./04_XPathAbstract.md) | This class is meant as an alternative base class for bridge implementations. It offers preliminary functionality for generating feeds based on _XPath expressions_.
+[`WebDriverAbstract`](./04_WebDriverAbstract.md) | This class is intended for _Bridges_ for websites that load or modify their content with XHR / JavaScript. It drives a real browser through Selenium WebDriver and is therefore very resource-intensive.
+[`XPathAbstract`](./05_XPathAbstract.md) | This class is meant as an alternative base class for bridge implementations. It offers preliminary functionality for generating feeds based on _XPath expressions_.
 
 For more information about how to create a new _Bridge_, read [How to create a new Bridge?](./01_How_to_create_a_new_bridge.md)
\ No newline at end of file
diff --git a/lib/WebDriverAbstract.php b/lib/WebDriverAbstract.php
new file mode 100644
index 00000000..db2fb7b1
--- /dev/null
+++ b/lib/WebDriverAbstract.php
@@ -0,0 +1,141 @@
+driver;
+    }
+
+    /**
+     * Returns the uri of the feed's icon.
+     *
+     * Falls back to parent::getIcon() as long as no icon was stored with
+     * setIcon() (or the stored value is empty).
+     *
+     * @return string
+     */
+    public function getIcon()
+    {
+        return $this->feedIcon ?: parent::getIcon();
+    }
+
+    /**
+     * Sets the uri of the feed's icon, which getIcon() returns afterwards.
+     *
+     * @param $iconurl string
+     */
+    protected function setIcon($iconurl)
+    {
+        $this->feedIcon = $iconurl;
+    }
+
+    /**
+     * Returns the ChromeOptions object. 
+     *
+     * When the configuration value webdriver/headless is truthy, the
+     * '--headless' argument is added. Subclasses may override this method
+     * to change or extend the options.
+     *
+     * @return ChromeOptions
+     */
+    protected function getBrowserOptions()
+    {
+        $options = new ChromeOptions();
+        $headless = Configuration::getConfig('webdriver', 'headless');
+        if ($headless) {
+            // a fixed size could be passed here as well: --window-size=1024,1024
+            $options->addArguments(['--headless']);
+        }
+        return $options;
+    }
+
+    /**
+     * Builds the capabilities for a Chrome session and attaches the options
+     * returned by getBrowserOptions().
+     *
+     * Subclasses may override this method to change or extend the
+     * capabilities.
+     *
+     * @return WebDriverCapabilities
+     */
+    protected function getDesiredCapabilities(): WebDriverCapabilities
+    {
+        $capabilities = DesiredCapabilities::chrome();
+        $browserOptions = $this->getBrowserOptions();
+        $capabilities->setCapability(ChromeOptions::CAPABILITY, $browserOptions);
+        return $capabilities;
+    }
+
+    /**
+     * Creates the remote webdriver from the configured server url
+     * (webdriver/selenium_server_url) and the desired capabilities, and
+     * stores it in $this->driver.
+     *
+     * This should be called in collectData() first.
+     */
+    protected function prepareWebDriver()
+    {
+        $this->driver = RemoteWebDriver::create(
+            Configuration::getConfig('webdriver', 'selenium_server_url'),
+            $this->getDesiredCapabilities()
+        );
+    }
+
+    /**
+     * Maximizes the remote browser window (often important for responsive
+     * sites which change their appearance depending on the window size)
+     * and then opens the uri returned by getURI().
+     */
+    protected function prepareWindow()
+    {
+        $driver = $this->getDriver();
+        $driver->manage()->window()->maximize();
+        $driver->get($this->getURI());
+    }
+
+    /**
+     * Quits the remote webdriver, which closes the remote browser window
+     * and ends the session.
+     *
+     * This must be called at the end of scraping, for example within a
+     * 'finally' block.
+     */
+    protected function cleanUp()
+    {
+        $driver = $this->getDriver();
+        $driver->quit();
+    }
+
+    /**
+     * Do your web scraping here and fill the $items array. 
+     *
+     * Override this but call parent::collectData() first.
+     * Don't forget to call cleanUp() at the end.
+     *
+     * This base implementation creates the webdriver session and opens the
+     * bridge's uri. If opening the window fails, the session that was just
+     * created is cleaned up before the error is passed on, because the
+     * overriding method has not reached its own cleanUp() call yet.
+     *
+     * @throws \Throwable Whatever prepareWebDriver() or prepareWindow() throws.
+     */
+    public function collectData()
+    {
+        $this->prepareWebDriver();
+        try {
+            $this->prepareWindow();
+        } catch (\Throwable $e) {
+            // the browser session already exists: release it, then rethrow
+            $this->cleanUp();
+            throw $e;
+        }
+    }
+}
\ No newline at end of file