[OLXBridge] new bridge (#2944)

* [OLXBridge] new bridge

* [OLXBridge] option to limit to shipping offers only

* [OLXBridge] set the feed title according to search query

* [OLXBridge] Fix PHP notices

* [OLXBridge] Remove trailing slash from the URL

* [OLXBridge] filter out the imposed additional search categories

* [OLXBridge] limit search to 'new' OLX platform variants

* [OLXBridge] Parse date, add ID, description

Deep-crawl all results. Penalty is low, as we were doing this for almost all of the results, anyway, yet it allows to obtain a unique ID, an uncomplicated Date string and a description.

Requires ext-intl for parsing the date according to locale.

* [OLXBridge] Parse date, add ID, description

Deep-crawl all results. Penalty is low, as we were doing this for almost all of the results, anyway, yet it allows to obtain a unique ID, an uncomplicated Date string and a description.

Requires ext-intl for parsing the date according to locale.

* [OLXBridge] Images are optional, handle appropriately

* [OLXBridge] handle the ID coming from sibling auto-moto portal

* [OLXBridge] handle the photos coming from sibling auto-moto portal

* [OLXBridge] use meta property to find img URL

* [OLXBridge] handle the date coming from sibling auto-moto portal

* [OLXBridge] use simplified syntax to retrieve content attribute value

* [OLXBridge] handle the description coming from sibling auto-moto portal

* [OLXBridge] fix phpcs complaints

* [OLXBridge] add categories

* [OLXBridge] handle the categories coming from sibling auto-moto portal

* [OLXBridge] hint image MIME type

OLX images have no obvious extension

* [OLXBridge] Fix content formatting

* [OLXBridge] URL is pattern-checked, so no need to check again

* [OLXBridge] return actual search query as URI
This commit is contained in:
Dawid Wróbel 2022-11-17 17:57:05 +01:00 committed by GitHub
parent 0726cce426
commit b017f75767
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 210 additions and 2 deletions

View file

@ -12,8 +12,11 @@ RUN apt-get update && \
zlib1g-dev \
libzip-dev \
libmemcached-dev \
nss-plugin-pem && \
nss-plugin-pem \
libicu-dev && \
docker-php-ext-install zip && \
docker-php-ext-install zip && \
docker-php-ext-install intl && \
pecl install memcached && \
docker-php-ext-enable memcached && \
docker-php-ext-enable opcache && \

204
bridges/OLXBridge.php Normal file
View file

@ -0,0 +1,204 @@
<?php
class OLXBridge extends BridgeAbstract
{
const NAME = 'OLX';
const DESCRIPTION = <<<'EOF'
Returns the search results from the OLX auctioning platforms
(Bulgaria, Kazakhstan, Poland, Portugal, Romania, Ukraine and Uzbekistan only)
EOF;
const URI = 'https://www.olx.com';
const MAINTAINER = 'wrobelda';
const PARAMETERS = [[
'url' => [
'name' => 'Search URL',
'title' => 'Copy the URL from your browser\'s address bar after searching for your items and paste it here',
'pattern' => '^(https:\/\/)?(www.)?olx\.(bg|kz|pl|pt|ro|ua|uz).*$',
'exampleValue' => 'https://www.olx.pl/d/oferty/q-cebula/',
'required' => true,
],
'includePostsWithoutPricetag' => [
'type' => 'checkbox',
'name' => 'Include posts without price tag'
],
'includeFeaturedPosts' => [
'type' => 'checkbox',
'name' => 'Include featured posts'
],
'shippingOfferedOnly' => [
'type' => 'checkbox',
'name' => 'Only posts with shipping offered'
]
]];
private function getHostname()
{
$scheme = parse_url($this->getInput('url'), PHP_URL_SCHEME);
$host = parse_url($this->getInput('url'), PHP_URL_HOST);
return $scheme . '://' . $host;
}
public function getURI()
{
if ($this->getInput('url')) {
# make sure we order by the most recently listed offers
$uri = trim(preg_replace('/([?&])search%5Border%5D=[^&]+(&|$)/', '$1', $this->getInput('url')), '?&/');
$uri = preg_replace('/([?&])view=[^&]+(&|$)/', '', $uri);
$uri .= (parse_url($uri, PHP_URL_QUERY) ? '&' : '?') . 'search%5Border%5D=created_at:desc';
return $uri;
} else {
return parent::getURI();
}
}
public function getName()
{
$paths = explode('/', parse_url($this->getInput('url'), PHP_URL_PATH));
$query = array_reduce($paths, function ($q, $p) {
if (preg_match('/^q-(.+)$/i', $p, $matches)) {
$q[] = str_replace('-', ' ', urldecode($matches[1]));
}
return $q;
});
if ($query) {
return $query[0];
}
return parent::getName();
}
public function collectData()
{
$html = getSimpleHTMLDOM($this->getURI());
$html = defaultLinkTo($html, $this->getHostname());
$isoLang = $html->find('meta[http-equiv=Content-Language]', 0)->content;
# the second grid, if any, has extended results from additional categories, outside of original search scope
$listing_grid = $html->find("div[data-testid='listing-grid']", 0);
$results = $listing_grid->find("div[data-cy='l-card']");
foreach ($results as $post) {
$item = [];
if (!$this->getInput('includeFeaturedPosts') && $post->find('div[data-testid="adCard-featured"]', 0)) {
continue;
}
$price = $post->find('p[data-testid="ad-price"]', 0)->plaintext ?? '';
if (!$this->getInput('includePostsWithoutPricetag') && !$price) {
continue;
}
$shippingOffered = $post->find('.css-1c0ed4l svg', 0)->outertext ?? false;
if ($this->getInput('shippingOfferedOnly') && !$shippingOffered) {
continue;
}
$negotiable = $post->find('p[data-testid="ad-price"] span.css-e2218f', 0)->plaintext ?? false;
if ($negotiable) {
$price = trim(str_replace($negotiable, '', $price));
$negotiable = '(' . $negotiable . ')';
}
if ($post->find('h6', 0)->plaintext != '') {
$item['uri'] = $post->find('a', 0)->href;
$item['title'] = $post->find('h6', 0)->plaintext;
}
# ignore the date component, as it is too convoluted — use the deep-crawled one; see below
$locationAndDate = $post->find('p[data-testid="location-date"]', 0)->plaintext;
$locationAndDateArray = explode(' - ', $locationAndDate, 2);
$location = trim($locationAndDateArray[0]);
# OLX only shows 5 results before images get lazy-loaded, so we have to deep-crawl *almost* all the results.
# Given that, do deep-crawl *all* the results, which allows to aso obtain the ID, the simplified location
# and date strings, as well as the detailed description.
$articleHTMLContent = getSimpleHTMLDOMCached($item['uri']);
# Extract a clean ID without resorting to the convoluted CSS class or sibling selectors. Should be always present.
$refreshLink = $articleHTMLContent->find('a[data-testid=refresh-link]', 0)->href ?? false;
if ($refreshLink) {
parse_str(parse_url($refreshLink, PHP_URL_QUERY), $refreshQuery);
$item['uid'] = $refreshQuery['ad-id'];
} else {
# may be an imported offer from a sibling auto-moto classifieds platform
$item['uid'] = $articleHTMLContent->find('span[id=ad_id]', 0)->plaintext;
}
$img = $articleHTMLContent->find('meta[property="og:image"]', 0)->content ?? false;
if ($img) {
$item['enclosures'] = [$img . '#.image'];
}
$isoDate = $articleHTMLContent->find('meta[property="og:updated_time"]', 0)->content ?? false;
if ($isoDate) {
$item['timestamp'] = strtotime($isoDate);
} else {
$date = $articleHTMLContent->find('span[data-cy="ad-posted-at"]', 0)->plaintext;
# Relative, today
if (preg_match('/^.*\s(\d\d:\d\d)$/i', $date, $matches)) {
$item['timestamp'] = strtotime($matches[1]);
} else {
# full, localized date
$formatter = new IntlDateFormatter($isoLang, IntlDateFormatter::SHORT, IntlDateFormatter::NONE);
$item['timestamp'] = $formatter->parse($date);
}
}
$descriptionHtml = $articleHTMLContent->find('div[data-cy="ad_description"] div', 0)->innertext ?? false;
if (!$descriptionHtml) {
$descriptionHtml = $articleHTMLContent->find('div[id="description"] div[data-read-more]', 0)->innertext ?? false;
}
$item['categories'] = [];
$breadcrumbs = $articleHTMLContent->find('li[data-testid="breadcrumb-item"]');
foreach ($breadcrumbs as $breadcrumb) {
$category = $breadcrumb->find('a[href!="/"]', 0) ?? false;
if ($category) {
$item['categories'][] = $category->plaintext;
}
}
$parameters = $articleHTMLContent->find('div.parametersArea li');
foreach ($parameters as $parameter) {
$category = $parameter->find('a', 0)->plaintext ?? false;
if ($category = empty($category) ? false : trim($category)) {
if ($category == 'Tak') {
$category = $parameter->find('span', 0)->plaintext ?? '';
} elseif ($category == 'Nie') {
continue;
}
$item['categories'][] = $category;
}
}
$item['content'] = <<<CONTENT
<table>
<tbody>
<tr>
<td>
<p>$location</p>
<p><span style="font-weight:bold">$price</span> $negotiable <span>$shippingOffered</span></p>
</td>
</tr>
<tr>
<td>$descriptionHtml</td>
</tr>
</tbody>
</table>
CONTENT;
$this->items[] = $item;
}
}
}

View file

@ -28,7 +28,8 @@
"ext-openssl": "*",
"ext-libxml": "*",
"ext-simplexml": "*",
"ext-json": "*"
"ext-json": "*",
"ext-intl": "*"
},
"require-dev": {
"phpunit/phpunit": "^9",