From b017f75767766193dee445a760f99ae08b78cacf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dawid=20Wr=C3=B3bel?= Date: Thu, 17 Nov 2022 17:57:05 +0100 Subject: [PATCH] [OLXBridge] new bridge (#2944) * [OLXBridge] new bridge * [OLXBridge] option to limit to shipping offers only * [OLXBridge] set the feed title according to search query * [OLXBridge] Fix PHP notices * [OLXBridge] Remove trailing slash from the URL * [OLXBridge] filter out the imposed additional search categories * [OLXBridge] limit search to 'new' OLX platform variants * [OLXBridge] Parse date, add ID, description Deep-crawl all results. The penalty is low, as we were doing this for almost all of the results anyway, yet it allows us to obtain a unique ID, an uncomplicated date string and a description. Requires ext-intl for parsing the date according to locale. * [OLXBridge] Parse date, add ID, description Deep-crawl all results. The penalty is low, as we were doing this for almost all of the results anyway, yet it allows us to obtain a unique ID, an uncomplicated date string and a description. Requires ext-intl for parsing the date according to locale. 
* [OLXBridge] Images are optional, handle appropriately * [OLXBridge] handle the ID coming from sibling auto-moto portal * [OLXBridge] handle the photos coming from sibling auto-moto portal * [OLXBridge] use meta property to find img URL * [OLXBridge] handle the date coming from sibling auto-moto portal * [OLXBridge] use simplified syntax to retrieve content attribute value * [OLXBridge] handle the description coming from sibling auto-moto portal * [OLXBridge] fix phpcs complaints * [OLXBridge] add categories * [OLXBridge] handle the categories coming from sibling auto-moto portal * [OLXBridge] hint image MIME type OLX images have no obvious extension * [OLXBridge] Fix content formatting * [OLXBridge] URL is pattern-checked, so no need to check again * [OLXBridge] return actual search query as URI --- Dockerfile | 5 +- bridges/OLXBridge.php | 204 ++++++++++++++++++++++++++++++++++++++++++ composer.json | 3 +- 3 files changed, 210 insertions(+), 2 deletions(-) create mode 100644 bridges/OLXBridge.php diff --git a/Dockerfile b/Dockerfile index 4688a955..dabcd97f 100644 --- a/Dockerfile +++ b/Dockerfile @@ -12,8 +12,11 @@ RUN apt-get update && \ zlib1g-dev \ libzip-dev \ libmemcached-dev \ - nss-plugin-pem && \ + nss-plugin-pem \ + libicu-dev && \ docker-php-ext-install zip && \ + docker-php-ext-install zip && \ + docker-php-ext-install intl && \ pecl install memcached && \ docker-php-ext-enable memcached && \ docker-php-ext-enable opcache && \ diff --git a/bridges/OLXBridge.php b/bridges/OLXBridge.php new file mode 100644 index 00000000..2fc19641 --- /dev/null +++ b/bridges/OLXBridge.php @@ -0,0 +1,204 @@ + [ + 'name' => 'Search URL', + 'title' => 'Copy the URL from your browser\'s address bar after searching for your items and paste it here', + 'pattern' => '^(https:\/\/)?(www.)?olx\.(bg|kz|pl|pt|ro|ua|uz).*$', + 'exampleValue' => 'https://www.olx.pl/d/oferty/q-cebula/', + 'required' => true, + ], + 'includePostsWithoutPricetag' => [ + 'type' => 'checkbox', + 'name' 
=> 'Include posts without price tag' + ], + 'includeFeaturedPosts' => [ + 'type' => 'checkbox', + 'name' => 'Include featured posts' + ], + 'shippingOfferedOnly' => [ + 'type' => 'checkbox', + 'name' => 'Only posts with shipping offered' + ] + ]]; + + private function getHostname() + { + $scheme = parse_url($this->getInput('url'), PHP_URL_SCHEME); + $host = parse_url($this->getInput('url'), PHP_URL_HOST); + + return $scheme . '://' . $host; + } + + public function getURI() + { + if ($this->getInput('url')) { + # make sure we order by the most recently listed offers + $uri = trim(preg_replace('/([?&])search%5Border%5D=[^&]+(&|$)/', '$1', $this->getInput('url')), '?&/'); + $uri = preg_replace('/([?&])view=[^&]+(&|$)/', '', $uri); + $uri .= (parse_url($uri, PHP_URL_QUERY) ? '&' : '?') . 'search%5Border%5D=created_at:desc'; + + return $uri; + } else { + return parent::getURI(); + } + } + + public function getName() + { + $paths = explode('/', parse_url($this->getInput('url'), PHP_URL_PATH)); + + $query = array_reduce($paths, function ($q, $p) { + if (preg_match('/^q-(.+)$/i', $p, $matches)) { + $q[] = str_replace('-', ' ', urldecode($matches[1])); + } + + return $q; + }); + + if ($query) { + return $query[0]; + } + + return parent::getName(); + } + + public function collectData() + { + $html = getSimpleHTMLDOM($this->getURI()); + $html = defaultLinkTo($html, $this->getHostname()); + + $isoLang = $html->find('meta[http-equiv=Content-Language]', 0)->content; + + # the second grid, if any, has extended results from additional categories, outside of original search scope + $listing_grid = $html->find("div[data-testid='listing-grid']", 0); + + $results = $listing_grid->find("div[data-cy='l-card']"); + + foreach ($results as $post) { + $item = []; + + if (!$this->getInput('includeFeaturedPosts') && $post->find('div[data-testid="adCard-featured"]', 0)) { + continue; + } + + $price = $post->find('p[data-testid="ad-price"]', 0)->plaintext ?? 
''; + if (!$this->getInput('includePostsWithoutPricetag') && !$price) { + continue; + } + + $shippingOffered = $post->find('.css-1c0ed4l svg', 0)->outertext ?? false; + if ($this->getInput('shippingOfferedOnly') && !$shippingOffered) { + continue; + } + + $negotiable = $post->find('p[data-testid="ad-price"] span.css-e2218f', 0)->plaintext ?? false; + if ($negotiable) { + $price = trim(str_replace($negotiable, '', $price)); + $negotiable = '(' . $negotiable . ')'; + } + + if ($post->find('h6', 0)->plaintext != '') { + $item['uri'] = $post->find('a', 0)->href; + $item['title'] = $post->find('h6', 0)->plaintext; + } + + # ignore the date component, as it is too convoluted — use the deep-crawled one; see below + $locationAndDate = $post->find('p[data-testid="location-date"]', 0)->plaintext; + $locationAndDateArray = explode(' - ', $locationAndDate, 2); + $location = trim($locationAndDateArray[0]); + + # OLX only shows 5 results before images get lazy-loaded, so we have to deep-crawl *almost* all the results. + # Given that, do deep-crawl *all* the results, which allows to also obtain the ID, the simplified location + # and date strings, as well as the detailed description. + $articleHTMLContent = getSimpleHTMLDOMCached($item['uri']); + + # Extract a clean ID without resorting to the convoluted CSS class or sibling selectors. Should be always present. + $refreshLink = $articleHTMLContent->find('a[data-testid=refresh-link]', 0)->href ?? false; + if ($refreshLink) { + parse_str(parse_url($refreshLink, PHP_URL_QUERY), $refreshQuery); + $item['uid'] = $refreshQuery['ad-id']; + } else { + # may be an imported offer from a sibling auto-moto classifieds platform + $item['uid'] = $articleHTMLContent->find('span[id=ad_id]', 0)->plaintext; + } + + $img = $articleHTMLContent->find('meta[property="og:image"]', 0)->content ?? false; + if ($img) { + $item['enclosures'] = [$img . '#.image']; + } + + $isoDate = $articleHTMLContent->find('meta[property="og:updated_time"]', 0)->content ?? 
false; + if ($isoDate) { + $item['timestamp'] = strtotime($isoDate); + } else { + $date = $articleHTMLContent->find('span[data-cy="ad-posted-at"]', 0)->plaintext; + # Relative, today + if (preg_match('/^.*\s(\d\d:\d\d)$/i', $date, $matches)) { + $item['timestamp'] = strtotime($matches[1]); + } else { + # full, localized date + $formatter = new IntlDateFormatter($isoLang, IntlDateFormatter::SHORT, IntlDateFormatter::NONE); + $item['timestamp'] = $formatter->parse($date); + } + } + + $descriptionHtml = $articleHTMLContent->find('div[data-cy="ad_description"] div', 0)->innertext ?? false; + if (!$descriptionHtml) { + $descriptionHtml = $articleHTMLContent->find('div[id="description"] div[data-read-more]', 0)->innertext ?? false; + } + + $item['categories'] = []; + $breadcrumbs = $articleHTMLContent->find('li[data-testid="breadcrumb-item"]'); + foreach ($breadcrumbs as $breadcrumb) { + $category = $breadcrumb->find('a[href!="/"]', 0) ?? false; + + if ($category) { + $item['categories'][] = $category->plaintext; + } + } + + $parameters = $articleHTMLContent->find('div.parametersArea li'); + foreach ($parameters as $parameter) { + $category = $parameter->find('a', 0)->plaintext ?? false; + + if ($category = empty($category) ? false : trim($category)) { + if ($category == 'Tak') { + $category = $parameter->find('span', 0)->plaintext ?? ''; + } elseif ($category == 'Nie') { + continue; + } + + $item['categories'][] = $category; + } + } + + $item['content'] = << + + + +

$location

+

$price $negotiable $shippingOffered

+ + + + $descriptionHtml + + + +CONTENT; + $this->items[] = $item; + } + } +} diff --git a/composer.json b/composer.json index 2c0c5038..a08c9666 100644 --- a/composer.json +++ b/composer.json @@ -28,7 +28,8 @@ "ext-openssl": "*", "ext-libxml": "*", "ext-simplexml": "*", - "ext-json": "*" + "ext-json": "*", + "ext-intl": "*" }, "require-dev": { "phpunit/phpunit": "^9",