2022-03-25 05:58:54 -04:00
|
|
|
<?php
|
|
|
|
|
2022-07-01 15:10:30 +02:00
|
|
|
/**
 * Bridge that turns a Craigslist search results page into feed items.
 *
 * The user supplies the regional subdomain and everything that follows
 * "/search/" in the URL; each ".result-row" element found on the fetched
 * page becomes one item.
 */
class CraigslistBridge extends BridgeAbstract
{
    const NAME = 'Craigslist Bridge';
    const URI = 'https://craigslist.org/';
    const DESCRIPTION = 'Returns craigslist search results';

    const PARAMETERS = [ [
        'region' => [
            'name' => 'Region',
            'title' => 'The subdomain before craigslist.org in the URL',
            'exampleValue' => 'sfbay',
            'required' => true,
        ],
        'search' => [
            'name' => 'Search Query',
            'title' => 'Everything in the URL after /search/',
            'exampleValue' => 'sya?query=laptop',
            'required' => true,
        ],
        'limit' => [
            'name' => 'Number of Posts',
            'type' => 'number',
            'title' => 'The maximum number of posts is 120. Use 0 for unlimited posts.',
            'defaultValue' => '25',
        ],
    ]];

    const TEST_DETECT_PARAMETERS = [
        'https://sfbay.craigslist.org/search/sya?query=laptop' => [
            'region' => 'sfbay', 'search' => 'sya?query=laptop',
        ],
        'https://newyork.craigslist.org/search/sss?query=32gb+flash+drive&bundleDuplicates=1&max_price=20' => [
            'region' => 'newyork', 'search' => 'sss?query=32gb+flash+drive&bundleDuplicates=1&max_price=20',
        ],
    ];

    // The dots of the host name are escaped so that they only match a literal
    // "." instead of any character.
    const URL_REGEX = '/^https:\/\/(?<region>\w+)\.craigslist\.org\/search\/(?<search>.+)/';

    /**
     * Extract the bridge parameters from a Craigslist search URL.
     *
     * @param string $url URL to inspect
     * @return array|null 'region' and 'search' entries when the URL matches
     *                    URL_REGEX, null otherwise
     */
    public function detectParameters($url)
    {
        if (!preg_match(self::URL_REGEX, $url, $matches)) {
            return null;
        }

        return [
            'region' => $matches['region'],
            'search' => $matches['search'],
        ];
    }

    /**
     * Build the search URL from the 'region' and 'search' inputs.
     *
     * Falls back to the parent implementation while no region input is set.
     *
     * @return string
     */
    public function getURI()
    {
        $region = $this->getInput('region');
        if (is_null($region)) {
            return parent::getURI();
        }

        $domain = 'https://' . $region . '.craigslist.org/search/';
        return urljoin($domain, $this->getInput('search'));
    }

    /**
     * Fetch the search page and append one item per local result to $this->items.
     *
     * Processing stops at the "h4.nearby" banner so that results from nearby
     * areas are left out, and at most 'limit' elements of the page are
     * considered when that input is greater than zero.
     */
    public function collectData()
    {
        $html = getSimpleHTMLDOM($this->getURI());

        // Check if no results page is shown (nearby results)
        $displayCount = $html->find('.displaycountShow', 0)->plaintext ?? '';
        if (trim($displayCount) === '0') {
            return;
        }

        // Search for "more from nearby areas" banner in order to skip those results
        $results = $html->find('.result-row, h4.nearby');

        // Limit the number of posts
        $limit = (int) $this->getInput('limit');
        if ($limit > 0) {
            $results = array_slice($results, 0, $limit);
        }

        foreach ($results as $post) {
            // Skip "nearby results" banner and results
            // This only appears when searchNearby is not specified
            if ($post->tag === 'h4') {
                break;
            }

            // A row without a heading link has no URI, title or id to report
            $heading = $post->find('.result-heading a', 0);
            if (is_null($heading)) {
                continue;
            }

            $item = [
                'uri' => $heading->href,
                'title' => $heading->plaintext,
                'uid' => $heading->id,
            ];

            // Only set the timestamp when the row actually carries a date element
            $date = $post->find('.result-date', 0);
            if (!is_null($date)) {
                $item['timestamp'] = $date->datetime;
            }

            $price = $post->find('.result-price', 0)->plaintext ?? '';
            // Find the location (local and nearby results if searchNearby=1)
            $nearby = $post->find('.result-hood, span.nearby', 0)->plaintext ?? '';
            $item['content'] = sprintf('%s %s', $price, $nearby);

            $images = $post->find('.result-image[data-ids]', 0);
            if (!is_null($images)) {
                $item['content'] .= '<br>';
                foreach (explode(',', $images->getAttribute('data-ids')) as $image) {
                    // Remove the leading prefix (e.g. "3:") from each image id:
                    // keep what follows the first colon instead of blindly
                    // dropping two characters
                    $image = trim($image);
                    $separator = strpos($image, ':');
                    $id = $separator === false ? $image : substr($image, $separator + 1);

                    // An empty entry would otherwise yield a bogus "_300x300.jpg" URL
                    if ($id === '') {
                        continue;
                    }

                    $image_uri = 'https://images.craigslist.org/' . $id . '_300x300.jpg';
                    $item['content'] .= '<img src="' . $image_uri . '">';
                    $item['enclosures'][] = $image_uri;
                }
            }

            $this->items[] = $item;
        }
    }
}
|