[ 'name' => 'Search URL', 'title' => 'Copy the URL from your browser\'s address bar after searching for your items and paste it here', 'pattern' => '^(https:\/\/)?(www.)?olx\.(bg|kz|pl|pt|ro|ua|uz).*$', 'exampleValue' => 'https://www.olx.pl/d/oferty/q-cebula/', 'required' => true, ], 'includePostsWithoutPricetag' => [ 'type' => 'checkbox', 'name' => 'Include posts without price tag' ], 'includeFeaturedPosts' => [ 'type' => 'checkbox', 'name' => 'Include featured posts' ], 'shippingOfferedOnly' => [ 'type' => 'checkbox', 'name' => 'Only posts with shipping offered' ] ]];

    /**
     * Return the user-supplied search URL with an explicit scheme.
     *
     * The input pattern accepts URLs without a leading "https://"; parse_url()
     * cannot extract a host from those, so the scheme is added here.
     *
     * @return string the normalised URL, or an empty string when no URL was given
     */
    private function getSearchUrl()
    {
        $url = trim((string) $this->getInput('url'));
        if ($url !== '' && !preg_match('~^[a-z][a-z0-9+.\-]*://~i', $url)) {
            $url = 'https://' . $url;
        }
        return $url;
    }

    /**
     * Build the "scheme://host" prefix of the search URL.
     *
     * Used as the base when relative links in fetched pages are made absolute.
     *
     * @return string
     */
    private function getHostname()
    {
        $url = $this->getSearchUrl();
        $scheme = parse_url($url, PHP_URL_SCHEME);
        $host = parse_url($url, PHP_URL_HOST);
        return $scheme . '://' . $host;
    }

    /**
     * Return the URL that is actually fetched: the search URL with any existing
     * sort order and view mode removed and "newest first" ordering appended.
     *
     * Falls back to the parent implementation when no search URL was given.
     *
     * @return string
     */
    public function getURI()
    {
        $url = $this->getSearchUrl();
        if ($url === '') {
            return parent::getURI();
        }

        # make sure we order by the most recently listed offers
        $uri = trim(preg_replace('/([?&])search%5Border%5D=[^&]+(&|$)/', '$1', $url), '?&/');
        # keep the leading separator ($1) so that the neighbouring parameters stay delimited,
        # then drop a separator that ended up dangling at the end
        $uri = rtrim(preg_replace('/([?&])view=[^&]+(&|$)/', '$1', $uri), '?&');
        $uri .= (parse_url($uri, PHP_URL_QUERY) ? '&' : '?') . 'search%5Border%5D=created_at:desc';

        return $uri;
    }

    /**
     * Return the feed name: the search phrase taken from the first "q-<phrase>"
     * path segment of the search URL (dashes shown as spaces), or the parent's
     * name when there is no URL or no such segment.
     *
     * @return string
     */
    public function getName()
    {
        $url = $this->getSearchUrl();
        if ($url === '') {
            return parent::getName();
        }

        $parsedUrl = Url::fromString($url);
        foreach (explode('/', $parsedUrl->getPath()) as $segment) {
            if (preg_match('/^q-(.+)$/i', $segment, $matches)) {
                return str_replace('-', ' ', urldecode($matches[1]));
            }
        }

        return parent::getName();
    }

    /**
     * Fetch the search result page, deep-crawl every listed offer and append one
     * feed item per offer to $this->items.
     *
     * Offers are skipped according to the 'includeFeaturedPosts',
     * 'includePostsWithoutPricetag' and 'shippingOfferedOnly' inputs, and when a
     * result card has no title or link to crawl.
     */
    public function collectData()
    {
        $hostname = $this->getHostname();

        $html = getSimpleHTMLDOM($this->getURI());
        $html = defaultLinkTo($html, $hostname);

        $isoLang = $html->find('meta[http-equiv=Content-Language]', 0)->content ?? null;

        # the second grid, if any, has extended results from additional categories, outside of original search scope
        $listingGrid = $html->find("div[data-testid='listing-grid']", 0);
        if (!$listingGrid) {
            # no result grid on the page: nothing to collect
            return;
        }

        foreach ($listingGrid->find("div[data-cy='l-card']") as $post) {
            $item = [];

            if (!$this->getInput('includeFeaturedPosts') && $post->find('div[data-testid="adCard-featured"]', 0)) {
                continue;
            }

            $price = $post->find('p[data-testid="ad-price"]', 0)->plaintext ?? '';
            if (!$this->getInput('includePostsWithoutPricetag') && !$price) {
                continue;
            }

            $negotiable = $post->find('p[data-testid="ad-price"] span.css-e2218f', 0)->plaintext ?? '';
            if ($negotiable) {
                $price = trim(str_replace($negotiable, '', $price));
                $negotiable = '(' . $negotiable . ')';
            }

            # without a title and a link there is nothing to deep-crawl, so skip the card
            $title = (string) ($post->find('h4', 0)->plaintext ?? '');
            $link = (string) ($post->find('a', 0)->href ?? '');
            if ($title === '' || $link === '') {
                continue;
            }
            $item['uri'] = $link;
            $item['title'] = $title;

            # ignore the date component, as it is too convoluted — use the deep-crawled one; see below
            $locationAndDate = (string) ($post->find('p[data-testid="location-date"]', 0)->plaintext ?? '');
            $location = trim(explode(' - ', $locationAndDate, 2)[0]);

            # OLX only shows 5 results before images get lazy-loaded, so we have to deep-crawl *almost* all the results.
            # Given that, do deep-crawl *all* the results, which allows to also obtain the ID, the simplified location
            # and date strings, as well as the detailed description.
            $articleHTMLContent = getSimpleHTMLDOMCached($item['uri']);
            $articleHTMLContent = defaultLinkTo($articleHTMLContent, $hostname);

            $shippingOffered = $articleHTMLContent->find('img[alt="Safety Badge"]', 0)->src ?? false;
            if ($this->getInput('shippingOfferedOnly') && !$shippingOffered) {
                continue;
            }

            # Extract a clean ID without resorting to the convoluted CSS class or sibling selectors.
            $uid = null;
            $refreshLink = $articleHTMLContent->find('a[data-testid=refresh-link]', 0)->href ?? false;
            if ($refreshLink) {
                parse_str((string) parse_url($refreshLink, PHP_URL_QUERY), $refreshQuery);
                $uid = $refreshQuery['ad-id'] ?? null;
            }
            if ($uid === null || $uid === '') {
                # may be an imported offer from a sibling auto-moto classifieds platform
                $uid = $articleHTMLContent->find('span[id=ad_id]', 0)->plaintext ?? null;
            }
            if ($uid !== null && $uid !== '') {
                $item['uid'] = $uid;
            }

            $img = $articleHTMLContent->find('meta[property="og:image"]', 0)->content ?? false;
            if ($img) {
                $item['enclosures'] = [$img . '#.image'];
            }

            $isoDate = $articleHTMLContent->find('meta[property="og:updated_time"]', 0)->content ?? false;
            if ($isoDate) {
                $item['timestamp'] = strtotime($isoDate);
            } else {
                $date = (string) ($articleHTMLContent->find('span[data-cy="ad-posted-at"]', 0)->plaintext ?? '');
                if (preg_match('/^.*\s(\d\d:\d\d)$/i', $date, $matches)) {
                    # Relative, today
                    $item['timestamp'] = strtotime($matches[1]);
                } elseif ($date !== '') {
                    # full, localized date
                    $formatter = new IntlDateFormatter($isoLang, IntlDateFormatter::SHORT, IntlDateFormatter::NONE);
                    $timestamp = $formatter->parse($date);
                    if ($timestamp !== false) {
                        $item['timestamp'] = $timestamp;
                    }
                }
            }

            $descriptionHtml = $articleHTMLContent->find('div[data-cy="ad_description"] div', 0)->innertext ?? false;
            if (!$descriptionHtml) {
                $descriptionHtml = $articleHTMLContent->find('div[id="description"] div[data-read-more]', 0)->innertext ?? '';
            }

            $item['categories'] = [];
            foreach ($articleHTMLContent->find('li[data-testid="breadcrumb-item"]') as $breadcrumb) {
                $crumbLink = $breadcrumb->find('a[href!="/"]', 0);
                if ($crumbLink) {
                    $item['categories'][] = $crumbLink->plaintext;
                }
            }

            foreach ($articleHTMLContent->find('div.parametersArea li') as $parameter) {
                $label = trim((string) ($parameter->find('a', 0)->plaintext ?? ''));
                if (!$label) {
                    continue;
                }
                if ($label === 'Tak') {
                    # the link only says "Tak"; the span text is used as the category instead
                    $label = $parameter->find('span', 0)->plaintext ?? '';
                } elseif ($label === 'Nie') {
                    continue;
                }
                $item['categories'][] = $label;
            }

            # NOTE(review): the markup around these placeholders looks like it was lost from this
            # copy of the file — only the heredoc delimiters are repaired here; restore the original
            # HTML wrapper from version control if there was one.
            $item['content'] = <<<CONTENT

$location

$price $negotiable

$descriptionHtml
CONTENT;

            $this->items[] = $item;
        }
    }
}