rss-bridge/lib/contents.php

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

459 lines
15 KiB
PHP
Raw Normal View History

<?php
/**
 * Minimal HTTP response value object: a body, a status code and a set of headers.
 */
final class Response
{
    /**
     * Reason phrases keyed by HTTP status code.
     */
    public const STATUS_CODES = [
        '100' => 'Continue',
        '101' => 'Switching Protocols',
        '200' => 'OK',
        '201' => 'Created',
        '202' => 'Accepted',
        '203' => 'Non-Authoritative Information',
        '204' => 'No Content',
        '205' => 'Reset Content',
        '206' => 'Partial Content',
        '300' => 'Multiple Choices',
        '301' => 'Moved Permanently',
        '302' => 'Found',
        '303' => 'See Other',
        '304' => 'Not Modified',
        '305' => 'Use Proxy',
        '400' => 'Bad Request',
        '401' => 'Unauthorized',
        '402' => 'Payment Required',
        '403' => 'Forbidden',
        '404' => 'Not Found',
        '405' => 'Method Not Allowed',
        '406' => 'Not Acceptable',
        '407' => 'Proxy Authentication Required',
        '408' => 'Request Timeout',
        '409' => 'Conflict',
        '410' => 'Gone',
        '411' => 'Length Required',
        '412' => 'Precondition Failed',
        '413' => 'Request Entity Too Large',
        '414' => 'Request-URI Too Long',
        '415' => 'Unsupported Media Type',
        '416' => 'Requested Range Not Satisfiable',
        '417' => 'Expectation Failed',
        '429' => 'Too Many Requests',
        '500' => 'Internal Server Error',
        '501' => 'Not Implemented',
        '502' => 'Bad Gateway',
        '503' => 'Service Unavailable',
        '504' => 'Gateway Timeout',
        '505' => 'HTTP Version Not Supported'
    ];

    /** Response body, printed verbatim by send(). */
    private string $body;

    /** HTTP status code passed to http_response_code() by send(). */
    private int $code;

    /** Headers as a map of header name => header value. */
    private array $headers;

    /**
     * @param string $body    Response body
     * @param int    $code    HTTP status code
     * @param array  $headers Map of header name => header value
     */
    public function __construct(
        string $body = '',
        int $code = 200,
        array $headers = []
    ) {
        $this->body = $body;
        $this->code = $code;
        $this->headers = $headers;
    }

    /**
     * @return string The response body as given to the constructor
     */
    public function getBody(): string
    {
        return $this->body;
    }

    /**
     * @return array The headers as given to the constructor (name => value)
     */
    public function getHeaders(): array
    {
        return $this->headers;
    }

    /**
     * Emit the response: status code first, then every header, then the body.
     */
    public function send(): void
    {
        http_response_code($this->code);
        foreach ($this->headers as $name => $value) {
            header(sprintf('%s: %s', $name, $value));
        }
        print $this->body;
    }
}
2022-05-11 23:34:18 +03:00
/**
 * Fetch data from an http url
 *
 * The request is performed by _http_request() using a set of browser-like default
 * headers which the caller can override or extend via $httpHeaders.
 *
 * The response body of a 200/201/202 response is stored in the 'server' cache scope
 * (keyed by the url) unless the server's Cache-Control header contains no-cache or
 * no-store. Outside of debug mode, the time of an existing cache entry is sent as an
 * If-Modified-Since condition, and a 304 response is answered from that cache entry.
 *
 * @param string $url The url to fetch
 * @param array $httpHeaders E.g. ['Content-type: text/plain']
 * @param array $curlOptions Associative array e.g. [CURLOPT_MAXREDIRS => 3]
 * @param bool $returnFull Whether to return an array:
 *                         [
 *                             'code'          => int,
 *                             'header'        => array,
 *                             'content'       => string,
 *                             'status_lines'  => array,
 *                         ]
 * @return string|array The response body, or the full array when $returnFull is true
 * @throws CloudFlareException When an unhandled status code comes with a known challenge page title
 * @throws HttpException When the status code is not one of 200, 201, 202, 301, 302, 303, 304
 */
function getContents(
    string $url,
    array $httpHeaders = [],
    array $curlOptions = [],
    bool $returnFull = false
) {
    $cache = RssBridge::getCache();
    $cache->setScope('server');
    $cache->setKey([$url]);

    // Snagged from https://github.com/lwthiker/curl-impersonate/blob/main/firefox/curl_ff102
    $defaultHttpHeaders = [
        'Accept' => 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
        'Accept-Language' => 'en-US,en;q=0.5',
        'Upgrade-Insecure-Requests' => '1',
        'Sec-Fetch-Dest' => 'document',
        'Sec-Fetch-Mode' => 'navigate',
        'Sec-Fetch-Site' => 'none',
        'Sec-Fetch-User' => '?1',
        'TE' => 'trailers',
    ];

    // Turn ['Name: value', ...] into ['Name' => 'value', ...].
    // Only the first colon separates name and value, so values may contain colons.
    $httpHeadersNormalized = [];
    foreach ($httpHeaders as $httpHeader) {
        $parts = explode(':', $httpHeader, 2);
        $headerName = trim($parts[0]);
        $headerValue = trim($parts[1] ?? '');
        $httpHeadersNormalized[$headerName] = $headerValue;
    }

    $config = [
        'useragent' => Configuration::getConfig('http', 'useragent'),
        'timeout' => Configuration::getConfig('http', 'timeout'),
        // Caller supplied headers win over the defaults
        'headers' => array_merge($defaultHttpHeaders, $httpHeadersNormalized),
        'curl_options' => $curlOptions,
    ];

    $maxFileSize = Configuration::getConfig('http', 'max_filesize');
    if ($maxFileSize) {
        // Multiply with 2^20 (1M) to the value in bytes
        $config['max_filesize'] = $maxFileSize * 2 ** 20;
    }

    if (Configuration::getConfig('proxy', 'url') && !defined('NOPROXY')) {
        $config['proxy'] = Configuration::getConfig('proxy', 'url');
    }

    // Only ask for a conditional response when there is a cached copy to fall back on
    if (!Debug::isEnabled() && $cache->getTime()) {
        $config['if_not_modified_since'] = $cache->getTime();
    }

    $result = _http_request($url, $config);
    $response = [
        'code' => $result['code'],
        'status_lines' => $result['status_lines'],
        'header' => $result['headers'],
        'content' => $result['body'],
    ];

    switch ($result['code']) {
        case 200:
        case 201:
        case 202:
            $mayStore = true;
            if (isset($result['headers']['cache-control'])) {
                // Only the most recently received Cache-Control value is inspected
                $cacheControlValues = $result['headers']['cache-control'];
                $lastValue = array_pop($cacheControlValues);
                // Cache directives are case-insensitive tokens, so compare them in lowercase
                $directives = array_map(
                    static function ($directive) {
                        return strtolower(trim($directive));
                    },
                    explode(',', $lastValue)
                );
                if (in_array('no-cache', $directives, true) || in_array('no-store', $directives, true)) {
                    // Don't cache as instructed by the server
                    $mayStore = false;
                }
            }
            if ($mayStore) {
                $cache->saveData($result['body']);
            }
            break;
        case 301:
        case 302:
        case 303:
            // todo: cache
            break;
        case 304:
            // Not Modified
            $response['content'] = $cache->loadData();
            break;
        default:
            $exceptionMessage = sprintf(
                '%s resulted in %s %s %s',
                $url,
                $result['code'],
                Response::STATUS_CODES[$result['code']] ?? '',
                // If debug, include a part of the response body in the exception message
                Debug::isEnabled() ? mb_substr($result['body'], 0, 500) : ''
            );

            // The following code must be extracted if it grows too much
            $cloudflareTitles = [
                '<title>Just a moment...',
                '<title>Please Wait...',
                '<title>Attention Required!',
                '<title>Security | Glassdoor',
            ];
            foreach ($cloudflareTitles as $cloudflareTitle) {
                if (str_contains($result['body'], $cloudflareTitle)) {
                    throw new CloudFlareException($exceptionMessage, $result['code']);
                }
            }
            throw new HttpException($exceptionMessage, $result['code']);
    }

    if ($returnFull === true) {
        return $response;
    }
    return $response['content'];
}
2018-11-16 23:48:59 +03:00
/**
 * Fetch content from url
 *
 * Performs the request with curl, following redirects, and retries the transfer when
 * curl reports a failure.
 *
 * Recognised $config keys (all optional):
 *  - 'useragent'             string|null User-Agent to send
 *  - 'timeout'               int         Maximum duration of the transfer in seconds
 *  - 'headers'               array       Request headers as name => value
 *  - 'proxy'                 string|null Proxy url
 *  - 'curl_options'          array       Extra CURLOPT_* => value pairs, applied after the options above
 *  - 'if_not_modified_since' int|null    Timestamp for an If-Modified-Since condition
 *  - 'retries'               int         Number of additional attempts after a failed transfer
 *  - 'max_filesize'          int|null    Maximum response size in bytes
 *  - 'max_redirections'      int         Maximum number of redirects to follow
 *
 * @internal Private function used internally
 * @return array [
 *                   'code'          => int,    status code of the last response
 *                   'status_lines'  => array,  raw status lines received (one per response, so
 *                                              redirects may contribute entries)
 *                   'headers'       => array,  lowercased header name => list of values
 *                   'body'          => string,
 *               ]
 * @throws HttpException When every attempt failed at the curl level
 * @throws \Exception When $config['curl_options'] contains an option curl rejects
 */
function _http_request(string $url, array $config = []): array
{
    $defaults = [
        'useragent' => null,
        'timeout' => 5,
        'headers' => [],
        'proxy' => null,
        'curl_options' => [],
        'if_not_modified_since' => null,
        'retries' => 3,
        'max_filesize' => null,
        'max_redirections' => 5,
    ];
    $config = array_merge($defaults, $config);

    $ch = curl_init($url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
    curl_setopt($ch, CURLOPT_MAXREDIRS, $config['max_redirections']);
    // Headers are collected by the header callback below, keep them out of the body
    curl_setopt($ch, CURLOPT_HEADER, false);

    $httpHeaders = [];
    foreach ($config['headers'] as $name => $value) {
        $httpHeaders[] = sprintf('%s: %s', $name, $value);
    }
    curl_setopt($ch, CURLOPT_HTTPHEADER, $httpHeaders);

    if ($config['useragent']) {
        curl_setopt($ch, CURLOPT_USERAGENT, $config['useragent']);
    }
    curl_setopt($ch, CURLOPT_TIMEOUT, $config['timeout']);
    // An empty string makes curl advertise every encoding it supports
    curl_setopt($ch, CURLOPT_ENCODING, '');
    curl_setopt($ch, CURLOPT_PROTOCOLS, CURLPROTO_HTTP | CURLPROTO_HTTPS);

    if ($config['max_filesize']) {
        // This option inspects the Content-Length header
        curl_setopt($ch, CURLOPT_MAXFILESIZE, $config['max_filesize']);
        curl_setopt($ch, CURLOPT_NOPROGRESS, false);
        // This progress function will monitor responses who omit the Content-Length header
        curl_setopt($ch, CURLOPT_PROGRESSFUNCTION, function ($ch, $downloadSize, $downloaded, $uploadSize, $uploaded) use ($config) {
            if ($downloaded > $config['max_filesize']) {
                // Return a non-zero value to abort the transfer
                return -1;
            }
            return 0;
        });
    }

    if ($config['proxy']) {
        curl_setopt($ch, CURLOPT_PROXY, $config['proxy']);
    }

    // Applied after the options above, so the caller is able to override them
    if (curl_setopt_array($ch, $config['curl_options']) === false) {
        throw new \Exception('Tried to set an illegal curl option');
    }

    if ($config['if_not_modified_since']) {
        curl_setopt($ch, CURLOPT_TIMEVALUE, $config['if_not_modified_since']);
        curl_setopt($ch, CURLOPT_TIMECONDITION, CURL_TIMECOND_IFMODSINCE);
    }

    $responseStatusLines = [];
    $responseHeaders = [];
    curl_setopt($ch, CURLOPT_HEADERFUNCTION, function ($ch, $rawHeader) use (&$responseHeaders, &$responseStatusLines) {
        // curl expects the callback to return the number of bytes it handled
        $len = strlen($rawHeader);

        if ($rawHeader === "\r\n") {
            // Blank line terminating a header section
            return $len;
        }
        // Any protocol version (HTTP/1.0, HTTP/1.1, HTTP/2, HTTP/3, ...) marks a status line
        if (preg_match('#^HTTP/\d+(?:\.\d+)?#', $rawHeader)) {
            $responseStatusLines[] = $rawHeader;
            return $len;
        }
        $header = explode(':', $rawHeader);
        if (count($header) === 1) {
            // Not a "name: value" line, ignore it
            return $len;
        }
        // Everything after the first colon is the value; names are stored lowercased
        $name = mb_strtolower(trim($header[0]));
        $value = trim(implode(':', array_slice($header, 1)));
        if (!isset($responseHeaders[$name])) {
            $responseHeaders[$name] = [];
        }
        $responseHeaders[$name][] = $value;
        return $len;
    });

    $attempts = 0;
    while (true) {
        $attempts++;

        // Start every attempt with a clean slate so that a failed attempt which already
        // received headers does not leak them into the result of a later attempt.
        // Both variables are bound by reference to the header callback above.
        $responseStatusLines = [];
        $responseHeaders = [];

        $data = curl_exec($ch);
        if ($data !== false) {
            // The network call was successful, so break out of the loop
            break;
        }
        if ($attempts > $config['retries']) {
            // Finally give up
            $curl_error = curl_error($ch);
            $curl_errno = curl_errno($ch);
            throw new HttpException(sprintf(
                'cURL error %s: %s (%s) for %s',
                $curl_error,
                $curl_errno,
                'https://curl.haxx.se/libcurl/c/libcurl-errors.html',
                $url
            ));
        }
    }

    $statusCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
    curl_close($ch);

    return [
        'code' => $statusCode,
        'status_lines' => $responseStatusLines,
        'headers' => $responseHeaders,
        'body' => $data,
    ];
}
2018-11-16 23:48:59 +03:00
/**
 * Downloads a page with getContents() and parses it into a simplehtmldom object.
 *
 * @param string $url The URL to download.
 * @param array $header (optional) A list of cURL headers, handed to getContents().
 * For more information follow the links below.
 * * https://php.net/manual/en/function.curl-setopt.php
 * * https://curl.haxx.se/libcurl/c/CURLOPT_HTTPHEADER.html
 * @param array $opts (optional) cURL options as an associative array in the
 * format `$opts[$option] = $value;`, where `$option` is any `CURLOPT_XXX`
 * option and `$value` the corresponding value.
 *
 * For more information see http://php.net/manual/en/function.curl-setopt.php
 * @param bool $lowercase Force all selectors to lowercase.
 * @param bool $forceTagsClosed Forcefully close tags in malformed HTML.
 *
 * _Remarks_: Forcefully closing tags is great for malformed HTML, but it can
 * lead to parsing errors.
 * @param string $target_charset Defines the target charset.
 * @param bool $stripRN Replace all occurrences of `"\r"` and `"\n"` by `" "`.
 * @param string $defaultBRText Replacement text for `<br>` tags when returning
 * plaintext.
 * @param string $defaultSpanText Replacement text for `<span />` tags when
 * returning plaintext.
 * @return false|simple_html_dom Contents as simplehtmldom object.
 */
function getSimpleHTMLDOM(
    $url,
    $header = [],
    $opts = [],
    $lowercase = true,
    $forceTagsClosed = true,
    $target_charset = DEFAULT_TARGET_CHARSET,
    $stripRN = true,
    $defaultBRText = DEFAULT_BR_TEXT,
    $defaultSpanText = DEFAULT_SPAN_TEXT
) {
    // A null header list or option list is treated like an empty one
    $requestHeaders = $header ?? [];
    $curlOptions = $opts ?? [];

    $html = getContents($url, $requestHeaders, $curlOptions);

    // The parsing flags are handed to the parser untouched
    return str_get_html($html, $lowercase, $forceTagsClosed, $target_charset, $stripRN, $defaultBRText, $defaultSpanText);
}
/**
 * Same as getSimpleHTMLDOM(), but the downloaded page is kept in the 'pages'
 * cache scope and re-used for subsequent calls until the cache duration elapsed.
 * The cache is bypassed while debug mode is enabled.
 *
 * _Notice_: Cached contents are forcefully removed after 24 hours (86400 seconds);
 * that removal is not performed by this function.
 *
 * @param string $url The URL to download.
 * @param int $duration Cache duration in seconds.
 * @param array $header (optional) A list of cURL headers, handed to getContents().
 * For more information follow the links below.
 * * https://php.net/manual/en/function.curl-setopt.php
 * * https://curl.haxx.se/libcurl/c/CURLOPT_HTTPHEADER.html
 * @param array $opts (optional) cURL options as an associative array in the
 * format `$opts[$option] = $value;`, where `$option` is any `CURLOPT_XXX`
 * option and `$value` the corresponding value.
 *
 * For more information see http://php.net/manual/en/function.curl-setopt.php
 * @param bool $lowercase Force all selectors to lowercase.
 * @param bool $forceTagsClosed Forcefully close tags in malformed HTML.
 *
 * _Remarks_: Forcefully closing tags is great for malformed HTML, but it can
 * lead to parsing errors.
 * @param string $target_charset Defines the target charset.
 * @param bool $stripRN Replace all occurrences of `"\r"` and `"\n"` by `" "`.
 * @param string $defaultBRText Replacement text for `<br>` tags when returning
 * plaintext.
 * @param string $defaultSpanText Replacement text for `<span />` tags when
 * returning plaintext.
 * @return false|simple_html_dom Contents as simplehtmldom object.
 */
function getSimpleHTMLDOMCached(
    $url,
    $duration = 86400,
    $header = [],
    $opts = [],
    $lowercase = true,
    $forceTagsClosed = true,
    $target_charset = DEFAULT_TARGET_CHARSET,
    $stripRN = true,
    $defaultBRText = DEFAULT_BR_TEXT,
    $defaultSpanText = DEFAULT_SPAN_TEXT
) {
    $pageCache = RssBridge::getCache();
    $pageCache->setScope('pages');
    $pageCache->setKey([$url]);

    // A cached copy is usable when it exists, is younger than $duration and debug mode is off
    $cachedAt = $pageCache->getTime();
    $isFresh = $cachedAt
        && time() - $duration < $cachedAt
        && !Debug::isEnabled();

    if ($isFresh) {
        // Cache hit
        $html = $pageCache->loadData();
    } else {
        $html = getContents($url, $header ?? [], $opts ?? []);
        if ($html) {
            // getContents() points the cache at its own 'server' scope, so select the
            // 'pages' entry again before storing (assumes RssBridge::getCache() hands
            // out a shared instance - TODO confirm)
            $pageCache->setScope('pages');
            $pageCache->setKey([$url]);
            $pageCache->saveData($html);
        }
    }

    return str_get_html($html, $lowercase, $forceTagsClosed, $target_charset, $stripRN, $defaultBRText, $defaultSpanText);
}