fix(FeedExpander): if parse fails, include offending url in exception message (#3938)

Also some refactors
This commit is contained in:
Dag 2024-01-29 21:51:34 +01:00 committed by GitHub
parent b2c8475b2c
commit d01c462ad5
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
5 changed files with 56 additions and 41 deletions

View file

@ -4,14 +4,14 @@ class DetectAction implements ActionInterface
{
public function execute(Request $request)
{
$targetURL = $request->get('url');
$url = $request->get('url');
$format = $request->get('format');
if (!$targetURL) {
throw new \Exception('You must specify a url!');
if (!$url) {
return new Response(render(__DIR__ . '/../templates/error.html.php', ['message' => 'You must specify a url']));
}
if (!$format) {
throw new \Exception('You must specify a format!');
return new Response(render(__DIR__ . '/../templates/error.html.php', ['message' => 'You must specify a format']));
}
$bridgeFactory = new BridgeFactory();
@ -23,19 +23,23 @@ class DetectAction implements ActionInterface
$bridge = $bridgeFactory->create($bridgeClassName);
$bridgeParams = $bridge->detectParameters($targetURL);
$bridgeParams = $bridge->detectParameters($url);
if (is_null($bridgeParams)) {
if (!$bridgeParams) {
continue;
}
$bridgeParams['bridge'] = $bridgeClassName;
$bridgeParams['format'] = $format;
$url = '?action=display&' . http_build_query($bridgeParams);
return new Response('', 301, ['location' => $url]);
$query = [
'action' => 'display',
'bridge' => $bridgeClassName,
'format' => $format,
];
$query = array_merge($query, $bridgeParams);
return new Response('', 301, ['location' => '?' . http_build_query($query)]);
}
throw new \Exception('No bridge found for given URL: ' . $targetURL);
return new Response(render(__DIR__ . '/../templates/error.html.php', [
'message' => 'No bridge found for given URL: ' . $url,
]));
}
}

View file

@ -9,10 +9,10 @@ class FindfeedAction implements ActionInterface
{
public function execute(Request $request)
{
$targetURL = $request->get('url');
$url = $request->get('url');
$format = $request->get('format');
if (!$targetURL) {
if (!$url) {
return new Response('You must specify a url', 400);
}
if (!$format) {
@ -29,7 +29,7 @@ class FindfeedAction implements ActionInterface
$bridge = $bridgeFactory->create($bridgeClassName);
$bridgeParams = $bridge->detectParameters($targetURL);
$bridgeParams = $bridge->detectParameters($url);
if ($bridgeParams === null) {
continue;

View file

@ -102,12 +102,17 @@ class BMDSystemhausBlogBridge extends BridgeAbstract
//-----------------------------------------------------
public function detectParameters($url)
{
$parsed_url = parse_url($url);
if ($parsed_url['host'] != 'www.bmd.com') {
try {
$parsedUrl = Url::fromString($url);
} catch (UrlException $e) {
return null;
}
$path = explode('/', $parsed_url['path']);
if ($parsedUrl->getHost() != 'www.bmd.com') {
return null;
}
$path = explode('/', $parsedUrl->getPath());
if ($this->getURIbyCountry($path[1]) == '') {
return null;

View file

@ -19,23 +19,6 @@ class FirefoxAddonsBridge extends BridgeAbstract
const CACHE_TIMEOUT = 3600;
private $feedName = '';
private $releaseDateRegex = '/Released ([\w, ]+) - ([\w. ]+)/';
private $xpiFileRegex = '/([A-Za-z0-9_.-]+)\.xpi$/';
private $outgoingRegex = '/https:\/\/prod.outgoing\.prod\.webservices\.mozgcp\.net\/v1\/(?:[A-z0-9]+)\//';
private $urlRegex = '/addons\.mozilla\.org\/(?:[\w-]+\/)?firefox\/addon\/([\w-]+)/';
public function detectParameters($url)
{
$params = [];
if (preg_match($this->urlRegex, $url, $matches)) {
$params['id'] = $matches[1];
return $params;
}
return null;
}
public function collectData()
{
@ -52,7 +35,8 @@ class FirefoxAddonsBridge extends BridgeAbstract
$item['uri'] = $this->getURI();
$item['author'] = $author;
if (preg_match($this->releaseDateRegex, $li->find('div.AddonVersionCard-fileInfo', 0)->plaintext, $match)) {
$releaseDateRegex = '/Released ([\w, ]+) - ([\w. ]+)/';
if (preg_match($releaseDateRegex, $li->find('div.AddonVersionCard-fileInfo', 0)->plaintext, $match)) {
$item['timestamp'] = $match[1];
$size = $match[2];
}
@ -68,7 +52,8 @@ class FirefoxAddonsBridge extends BridgeAbstract
$releaseNotes = $this->removeLinkRedirects($li->find('div.AddonVersionCard-releaseNotes', 0));
if (preg_match($this->xpiFileRegex, $downloadlink, $match)) {
$xpiFileRegex = '/([A-Za-z0-9_.-]+)\.xpi$/';
if (preg_match($xpiFileRegex, $downloadlink, $match)) {
$xpiFilename = $match[0];
}
@ -110,10 +95,25 @@ EOD;
*/
private function removeLinkRedirects($html)
{
$outgoingRegex = '/https:\/\/prod.outgoing\.prod\.webservices\.mozgcp\.net\/v1\/(?:[A-z0-9]+)\//';
foreach ($html->find('a') as $a) {
$a->href = urldecode(preg_replace($this->outgoingRegex, '', $a->href));
$a->href = urldecode(preg_replace($outgoingRegex, '', $a->href));
}
return $html->innertext;
}
public function detectParameters($url)
{
$params = [];
// Example: https://addons.mozilla.org/en-US/firefox/addon/ublock-origin
$pattern = '/addons\.mozilla\.org\/(?:[\w-]+\/)?firefox\/addon\/([\w-]+)/';
if (preg_match($pattern, $url, $matches)) {
$params['id'] = $matches[1];
return $params;
}
return null;
}
}

View file

@ -23,14 +23,20 @@ abstract class FeedExpander extends BridgeAbstract
throw new \Exception(sprintf('Unable to parse xml from `%s` because we got the empty string', $url), 10);
}
// prepare/massage the xml to make it more acceptable
$badStrings = [
$problematicStrings = [
' ',
'»',
'’',
];
$xmlString = str_replace($badStrings, '', $xmlString);
$xmlString = str_replace($problematicStrings, '', $xmlString);
$feedParser = new FeedParser();
$this->feed = $feedParser->parseFeed($xmlString);
try {
$this->feed = $feedParser->parseFeed($xmlString);
} catch (\Exception $e) {
throw new \Exception(sprintf('Failed to parse xml from %s: %s', $url, create_sane_exception_message($e)));
}
$items = array_slice($this->feed['items'], 0, $maxItems);
// todo: extract parse logic out from FeedParser
foreach ($items as $item) {