2023-10-12 23:14:04 +03:00
|
|
|
<?php
|
|
|
|
|
|
|
|
declare(strict_types=1);
|
|
|
|
|
2023-10-13 12:24:22 +03:00
|
|
|
/**
 * Very basic and naive feed parser.
 *
 * Scrapes out RSS 0.91, 1.0, 2.0 and Atom 1.0.
 *
 * Produces an array meant to be used inside rss-bridge.
 *
 * The item structure is tweaked so that it works with FeedItem.
 */
final class FeedParser
{
    /**
     * Parse a feed document and return its metadata together with its items.
     *
     * The format is detected from the document structure:
     *  - RSS 1.0: <item> elements are direct children of the root element
     *  - RSS 0.91/2.0: a <channel> element is a direct child of the root element
     *  - Atom 1.0: <entry> elements are direct children of the root element
     *
     * @param string $xmlString Raw XML of the feed; surrounding whitespace is ignored.
     *
     * @return array{title: ?string, uri: ?string, icon: ?string, items: list<array<string, mixed>>}
     *
     * @throws \Exception When the XML cannot be parsed or the feed format is not recognised.
     */
    public function parseFeed(string $xmlString): array
    {
        // Collect libxml errors ourselves instead of letting them surface as PHP warnings.
        // The previous error mode is restored afterwards so that callers which manage
        // libxml errors themselves are not affected by this method.
        $previousErrorMode = libxml_use_internal_errors(true);
        libxml_clear_errors();
        $xml = simplexml_load_string(trim($xmlString));
        $xmlErrors = libxml_get_errors();
        libxml_clear_errors();
        libxml_use_internal_errors($previousErrorMode);

        if ($xml === false) {
            // libxml messages end with a newline, hence the trim()
            $firstXmlErrorMessage = $xmlErrors === [] ? '' : trim($xmlErrors[0]->message);
            throw new \Exception(sprintf('Unable to parse xml: %s', $firstXmlErrorMessage));
        }

        $feed = [
            'title' => null,
            'uri'   => null,
            'icon'  => null,
            'items' => [],
        ];

        if (isset($xml->item[0])) {
            // rss 1.0: items are siblings of <channel>
            $feed = $this->applyRssChannelMetadata($feed, $xml->channel[0]);
            foreach ($xml->item as $item) {
                $feed['items'][] = $this->parseRss1Item($item);
            }
        } elseif (isset($xml->channel[0])) {
            // rss 2.0: items are children of <channel>
            $channel = $xml->channel[0];
            $feed = $this->applyRssChannelMetadata($feed, $channel);
            foreach ($channel->item as $item) {
                $feed['items'][] = $this->parseRss2Item($item);
            }
        } elseif (isset($xml->entry[0])) {
            // atom 1.0
            $feed['title'] = trim((string)$xml->title);
            $feed['uri'] = $this->findAtomFeedLink($xml);
            if (!empty($xml->icon)) {
                $feed['icon'] = (string)$xml->icon;
            } elseif (!empty($xml->logo)) {
                $feed['icon'] = (string)$xml->logo;
            }
            foreach ($xml->entry as $item) {
                $feed['items'][] = $this->parseAtomItem($item);
            }
        } else {
            throw new \Exception('Unable to detect feed format');
        }

        return $feed;
    }

    /**
     * Parse a single Atom <entry>.
     *
     * The entry is first run through parseRss2Item() and the Atom specific
     * fields (id, title, updated, author, content, link) are then laid on top.
     *
     * @return array<string, mixed>
     */
    public function parseAtomItem(\SimpleXMLElement $feedItem): array
    {
        $item = $this->parseRss2Item($feedItem);

        if (isset($feedItem->id)) {
            $item['uri'] = (string)$feedItem->id;
        }
        if (isset($feedItem->title)) {
            $item['title'] = html_entity_decode((string)$feedItem->title);
        }
        if (isset($feedItem->updated)) {
            $item['timestamp'] = strtotime((string)$feedItem->updated);
        }
        if (isset($feedItem->author)) {
            $item['author'] = (string)$feedItem->author->name;
        }
        if (isset($feedItem->content)) {
            $contentChildren = $feedItem->content->children();
            if (count($contentChildren) > 0) {
                // The content holds markup as real child elements: serialize them back to a string
                $content = '';
                foreach ($contentChildren as $contentChild) {
                    $content .= $contentChild->asXML();
                }
                $item['content'] = $content;
            } else {
                $item['content'] = (string)$feedItem->content;
            }
        }

        // When "link" field is present, URL is more reliable than "id" field
        if (count($feedItem->link) === 1) {
            $item['uri'] = (string)$feedItem->link[0]['href'];
        } else {
            foreach ($feedItem->link as $link) {
                $rel = strtolower((string)$link['rel']);
                if ($rel === 'alternate') {
                    $item['uri'] = (string)$link['href'];
                }
                if ($rel === 'enclosure') {
                    $item['enclosures'][] = (string)$link['href'];
                }
            }
        }

        return $item;
    }

    /**
     * Parse a single RSS 0.91/2.0 <item>.
     *
     * Besides the well-known keys (uri, title, content, timestamp, author) the
     * result also contains every childless element of the item under its own
     * name, and one sub-array per namespaced module (e.g. 'dc', 'itunes').
     *
     * @return array<string, mixed>
     */
    public function parseRss2Item(\SimpleXMLElement $feedItem): array
    {
        $item = [
            'uri'       => '',
            'title'     => '',
            'content'   => '',
            'timestamp' => '',
            'author'    => '',
            //'uid' => null,
            //'categories' => [],
            //'enclosures' => [],
        ];

        // Copy all plain (childless) elements verbatim under their own name
        foreach ($feedItem as $k => $v) {
            $hasChildren = count($v) !== 0;
            if (!$hasChildren) {
                $item[$k] = (string)$v;
            }
        }

        if (isset($feedItem->link)) {
            // Feeds are frequently pretty-printed with the url on its own line
            $item['uri'] = trim((string)$feedItem->link);
        }
        if (isset($feedItem->title)) {
            $item['title'] = html_entity_decode((string)$feedItem->title);
        }
        if (isset($feedItem->description)) {
            $item['content'] = (string)$feedItem->description;
        }

        $namespaces = $feedItem->getNamespaces(true);
        $dc = isset($namespaces['dc']) ? $feedItem->children($namespaces['dc']) : null;
        $media = isset($namespaces['media']) ? $feedItem->children($namespaces['media']) : null;

        foreach ($namespaces as $namespaceName => $namespaceUrl) {
            if (in_array($namespaceName, ['', 'content', 'media'], true)) {
                continue;
            }
            $module = $feedItem->children($namespaceUrl);
            $item[$namespaceName] = [];
            foreach ($module as $moduleKey => $moduleValue) {
                $item[$namespaceName][$moduleKey] = (string)$moduleValue;
            }
        }

        if (isset($namespaces['itunes'])) {
            $enclosure = $feedItem->enclosure;
            $item['enclosure'] = [
                'url'    => (string)$enclosure['url'],
                'length' => (string)$enclosure['length'],
                'type'   => (string)$enclosure['type'],
            ];
        }

        if (isset($feedItem->guid)) {
            // Pluck out a url from guid. Only considered when the isPermaLink attribute is present.
            $isPermaLinkAttribute = $feedItem->guid['isPermaLink'];
            if ($isPermaLinkAttribute !== null) {
                $guid = trim((string)$feedItem->guid);
                // The attribute is a SimpleXMLElement and must be cast before it is compared to a string
                $isPermaLink = strtolower(trim((string)$isPermaLinkAttribute)) === 'true';
                $guidIsUrl = filter_var($guid, FILTER_VALIDATE_URL) !== false;
                $uriIsUrl = filter_var($item['uri'], FILTER_VALIDATE_URL) !== false;
                // Use the guid when it is a url and either the feed declares it as the
                // permalink or the <link> is missing/unusable. A guid that is not a url
                // never replaces the link, even when it is declared as a permalink.
                if ($guidIsUrl && ($isPermaLink || !$uriIsUrl)) {
                    $item['uri'] = $guid;
                }
            }
        }

        if (isset($feedItem->pubDate)) {
            $item['timestamp'] = strtotime((string)$feedItem->pubDate);
        } elseif (isset($dc->date)) {
            $item['timestamp'] = strtotime((string)$dc->date);
        }

        if (isset($feedItem->author)) {
            $item['author'] = (string)$feedItem->author;
        } elseif (isset($feedItem->creator)) {
            $item['author'] = (string)$feedItem->creator;
        } elseif (isset($dc->creator)) {
            $item['author'] = (string)$dc->creator;
        } elseif (isset($media->credit)) {
            $item['author'] = (string)$media->credit;
        }

        if (isset($feedItem->enclosure) && !empty($feedItem->enclosure['url'])) {
            $item['enclosures'] = [
                (string)$feedItem->enclosure['url'],
            ];
        }

        return $item;
    }

    /**
     * Parse a single RSS 1.0 <item>.
     *
     * Timestamp and author are taken from the Dublin Core ("dc") module when
     * the document declares that namespace.
     *
     * @return array<string, mixed>
     */
    public function parseRss1Item(\SimpleXMLElement $feedItem): array
    {
        $item = [
            'uri'       => '',
            'title'     => '',
            'content'   => '',
            'timestamp' => '',
            'author'    => '',
            //'uid' => null,
            //'categories' => [],
            //'enclosures' => [],
        ];

        if (isset($feedItem->link)) {
            // Feeds are frequently pretty-printed with the url on its own line
            $item['uri'] = trim((string)$feedItem->link);
        }
        if (isset($feedItem->title)) {
            $item['title'] = html_entity_decode((string)$feedItem->title);
        }
        if (isset($feedItem->description)) {
            $item['content'] = (string)$feedItem->description;
        }

        $namespaces = $feedItem->getNamespaces(true);
        if (isset($namespaces['dc'])) {
            $dc = $feedItem->children($namespaces['dc']);
            if (isset($dc->date)) {
                $item['timestamp'] = strtotime((string)$dc->date);
            }
            if (isset($dc->creator)) {
                $item['author'] = (string)$dc->creator;
            }
        }

        return $item;
    }

    /**
     * Copy title, link and image url of an RSS <channel> into the feed array.
     *
     * A missing channel yields empty strings for title and uri.
     */
    private function applyRssChannelMetadata(array $feed, ?\SimpleXMLElement $channel): array
    {
        if ($channel === null) {
            $feed['title'] = '';
            $feed['uri'] = '';
            return $feed;
        }

        $feed['title'] = trim((string)$channel->title);
        $feed['uri'] = trim((string)$channel->link);
        if (!empty($channel->image)) {
            $feed['icon'] = trim((string)$channel->image->url);
        }

        return $feed;
    }

    /**
     * Find the best feed level link of an Atom document: the only link when
     * there is exactly one, otherwise the first link with rel="alternate".
     *
     * Returns an empty string when no suitable link exists.
     */
    private function findAtomFeedLink(\SimpleXMLElement $xml): string
    {
        if (!isset($xml->link)) {
            return '';
        }
        if (count($xml->link) === 1) {
            return (string)$xml->link[0]['href'];
        }
        foreach ($xml->link as $link) {
            if (strtolower((string)$link['rel']) === 'alternate') {
                return (string)$link['href'];
            }
        }

        return '';
    }
}
|