2016-09-26 00:22:33 +03:00
|
|
|
<?php
|
2022-03-11 23:18:01 +03:00
|
|
|
|
2022-05-11 23:34:18 +03:00
|
|
|
/**
 * Fetch data from an http url
 *
 * Sends the request through the shared http client, using browser-like default
 * headers that can be overridden by $httpHeaders. Responses with status 200, 201
 * or 202 are stored in the cache under the key 'server_' . $url (unless the server
 * sent a no-cache / no-store directive). When a cached response carries a
 * Last-Modified header, it is forwarded to the http client as
 * 'if_not_modified_since' and a 304 answer is served with the cached body.
 *
 * @param string $url The url to fetch
 * @param array $httpHeaders E.g. ['Content-type: text/plain']
 * @param array $curlOptions Associative array e.g. [CURLOPT_MAXREDIRS => 3]
 * @param bool $returnFull Whether to return an array: ['code' => int, 'headers' => array, 'content' => string]
 * @return string|array The response body, or the full array described above when $returnFull is true
 * @throws CloudFlareException When an unexpected status code comes with a known challenge page title
 * @throws HttpException For any other unexpected status code, or a 304 without a cached response
 */
function getContents(
    string $url,
    array $httpHeaders = [],
    array $curlOptions = [],
    bool $returnFull = false
) {
    $httpClient = RssBridge::getHttpClient();

    // Snagged from https://github.com/lwthiker/curl-impersonate/blob/main/firefox/curl_ff102
    $defaultHttpHeaders = [
        'Accept' => 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
        'Accept-Language' => 'en-US,en;q=0.5',
        'Upgrade-Insecure-Requests' => '1',
        'Sec-Fetch-Dest' => 'document',
        'Sec-Fetch-Mode' => 'navigate',
        'Sec-Fetch-Site' => 'none',
        'Sec-Fetch-User' => '?1',
        'TE' => 'trailers',
    ];

    // Turn ['Name: value', ...] into ['Name' => 'value', ...].
    // Only the first colon separates name and value, so values may contain colons.
    $httpHeadersNormalized = [];
    foreach ($httpHeaders as $httpHeader) {
        $parts = explode(':', $httpHeader, 2);
        $headerName = trim($parts[0]);
        $headerValue = trim($parts[1] ?? '');
        $httpHeadersNormalized[$headerName] = $headerValue;
    }

    $config = [
        'useragent' => Configuration::getConfig('http', 'useragent'),
        'timeout' => Configuration::getConfig('http', 'timeout'),
        // Caller supplied headers win over the defaults (keys are matched case-sensitively)
        'headers' => array_merge($defaultHttpHeaders, $httpHeadersNormalized),
        'curl_options' => $curlOptions,
    ];

    $maxFileSize = Configuration::getConfig('http', 'max_filesize');
    if ($maxFileSize) {
        // Multiply with 2^20 (1M) to the value in bytes
        $config['max_filesize'] = $maxFileSize * 2 ** 20;
    }

    if (Configuration::getConfig('proxy', 'url') && !defined('NOPROXY')) {
        $config['proxy'] = Configuration::getConfig('proxy', 'url');
    }

    $cache = RssBridge::getCache();
    $cacheKey = 'server_' . $url;

    /** @var Response $cachedResponse */
    $cachedResponse = $cache->get($cacheKey);
    if ($cachedResponse) {
        $cachedLastModified = $cachedResponse->getHeader('last-modified');
        if ($cachedLastModified) {
            try {
                $cachedLastModified = new \DateTimeImmutable($cachedLastModified);
                $config['if_not_modified_since'] = $cachedLastModified->getTimestamp();
            } catch (\Exception $e) {
                // An unparsable Last-Modified value must not abort the request:
                // fall back to an unconditional fetch.
            }
        }
    }

    $response = $httpClient->request($url, $config);

    switch ($response->getCode()) {
        case 200:
        case 201:
        case 202:
            $cacheControl = $response->getHeader('cache-control');
            if ($cacheControl) {
                // Directive names are case-insensitive, so compare them lowercased
                $directives = array_map('trim', explode(',', strtolower($cacheControl)));
                if (in_array('no-cache', $directives, true) || in_array('no-store', $directives, true)) {
                    // Don't cache as instructed by the server
                    break;
                }
            }
            // Keep the response for 10 days
            $cache->set($cacheKey, $response, 86400 * 10);
            break;
        case 301:
        case 302:
        case 303:
            // todo: cache
            break;
        case 304:
            // Not Modified
            if (!$cachedResponse) {
                // Can happen when the caller sends its own conditional request header:
                // there is no cached body to serve, so fail explicitly instead of
                // dereferencing a missing cache entry.
                throw new HttpException(
                    sprintf('%s resulted in 304 but no cached response is available', $url),
                    $response->getCode()
                );
            }
            $response = $response->withBody($cachedResponse->getBody());
            break;
        default:
            $responseBody = $response->getBody();
            $exceptionMessage = sprintf(
                '%s resulted in %s %s %s',
                $url,
                $response->getCode(),
                $response->getStatusLine(),
                // If debug, include a part of the response body in the exception message
                Debug::isEnabled() ? mb_substr($responseBody, 0, 500) : '',
            );

            // The following code must be extracted if it grows too much
            $cloudflareTitles = [
                '<title>Just a moment...',
                '<title>Please Wait...',
                '<title>Attention Required!',
                '<title>Security | Glassdoor',
            ];
            foreach ($cloudflareTitles as $cloudflareTitle) {
                if (str_contains($responseBody, $cloudflareTitle)) {
                    throw new CloudFlareException($exceptionMessage, $response->getCode());
                }
            }
            throw new HttpException(trim($exceptionMessage), $response->getCode());
    }

    if ($returnFull === true) {
        return [
            'code' => $response->getCode(),
            'headers' => $response->getHeaders(),
            // For legacy reasons, use 'content' instead of 'body'
            'content' => $response->getBody(),
        ];
    }
    return $response->getBody();
}
|
|
|
|
|
2018-11-16 23:48:59 +03:00
|
|
|
/**
 * Downloads a page and parses it into a simplehtmldom object.
 *
 * The body is retrieved with getContents() and handed to str_get_html()
 * together with the parser settings given below.
 *
 * @param string $url The URL.
 * @param array $header (optional) A list of cURL header. A null value is treated as an empty list.
 * For more information follow the links below.
 * * https://php.net/manual/en/function.curl-setopt.php
 * * https://curl.haxx.se/libcurl/c/CURLOPT_HTTPHEADER.html
 * @param array $opts (optional) A list of cURL options as associative array in
 * the format `$opts[$option] = $value;`, where `$option` is any `CURLOPT_XXX`
 * option and `$value` the corresponding value. A null value is treated as an empty list.
 *
 * For more information see http://php.net/manual/en/function.curl-setopt.php
 * @param bool $lowercase Force all selectors to lowercase.
 * @param bool $forceTagsClosed Forcefully close tags in malformed HTML.
 *
 * _Remarks_: Forcefully closing tags is great for malformed HTML, but it can
 * lead to parsing errors.
 * @param string $target_charset Defines the target charset.
 * @param bool $stripRN Replace all occurrences of `"\r"` and `"\n"` by `" "`.
 * @param string $defaultBRText Specifies the replacement text for `<br>` tags
 * when returning plaintext.
 * @param string $defaultSpanText Specifies the replacement text for `<span />`
 * tags when returning plaintext.
 * @return false|simple_html_dom Contents as simplehtmldom object.
 */
function getSimpleHTMLDOM(
    $url,
    $header = [],
    $opts = [],
    $lowercase = true,
    $forceTagsClosed = true,
    $target_charset = DEFAULT_TARGET_CHARSET,
    $stripRN = true,
    $defaultBRText = DEFAULT_BR_TEXT,
    $defaultSpanText = DEFAULT_SPAN_TEXT
) {
    // Callers may pass null for the header / option lists; treat that as "none".
    $requestHeaders = $header ?? [];
    $requestOptions = $opts ?? [];

    $html = getContents($url, $requestHeaders, $requestOptions);

    return str_get_html(
        $html,
        $lowercase,
        $forceTagsClosed,
        $target_charset,
        $stripRN,
        $defaultBRText,
        $defaultSpanText
    );
}
|
|
|
|
|
|
|
|
/**
 * Downloads a page and parses it into a simplehtmldom object, re-using a
 * cached copy of the raw contents for subsequent calls.
 *
 * The raw contents are stored under the cache key 'pages_' . $url and written
 * with the given $ttl (24 hours by default). The key depends on the URL only,
 * so calls that differ merely in $header or $opts share the same cache entry.
 * A falsy cache value (including an empty body) counts as a cache miss and
 * triggers a new download through getContents().
 *
 * @param string $url The URL.
 * @param int $ttl Cache duration in seconds.
 * @param array $header (optional) A list of cURL header. A null value is treated as an empty list.
 * For more information follow the links below.
 * * https://php.net/manual/en/function.curl-setopt.php
 * * https://curl.haxx.se/libcurl/c/CURLOPT_HTTPHEADER.html
 * @param array $opts (optional) A list of cURL options as associative array in
 * the format `$opts[$option] = $value;`, where `$option` is any `CURLOPT_XXX`
 * option and `$value` the corresponding value. A null value is treated as an empty list.
 *
 * For more information see http://php.net/manual/en/function.curl-setopt.php
 * @param bool $lowercase Force all selectors to lowercase.
 * @param bool $forceTagsClosed Forcefully close tags in malformed HTML.
 *
 * _Remarks_: Forcefully closing tags is great for malformed HTML, but it can
 * lead to parsing errors.
 * @param string $target_charset Defines the target charset.
 * @param bool $stripRN Replace all occurrences of `"\r"` and `"\n"` by `" "`.
 * @param string $defaultBRText Specifies the replacement text for `<br>` tags
 * when returning plaintext.
 * @param string $defaultSpanText Specifies the replacement text for `<span />`
 * tags when returning plaintext.
 * @return false|simple_html_dom Contents as simplehtmldom object.
 */
function getSimpleHTMLDOMCached(
    $url,
    $ttl = 86400,
    $header = [],
    $opts = [],
    $lowercase = true,
    $forceTagsClosed = true,
    $target_charset = DEFAULT_TARGET_CHARSET,
    $stripRN = true,
    $defaultBRText = DEFAULT_BR_TEXT,
    $defaultSpanText = DEFAULT_SPAN_TEXT
) {
    $pageCache = RssBridge::getCache();
    $pageKey = 'pages_' . $url;

    $html = $pageCache->get($pageKey);
    if (!$html) {
        // Cache miss: download the page and remember it for $ttl seconds.
        $requestHeaders = $header ?? [];
        $requestOptions = $opts ?? [];
        $html = getContents($url, $requestHeaders, $requestOptions);
        $pageCache->set($pageKey, $html, $ttl);
    }

    return str_get_html(
        $html,
        $lowercase,
        $forceTagsClosed,
        $target_charset,
        $stripRN,
        $defaultBRText,
        $defaultSpanText
    );
}
|