[
                'name' => 'Group or user name',
                'exampleValue' => 'elonmusk_tech',
                'required' => true
            ],
            'hide_reposts' => [
                'name' => 'Hide reposts',
                'type' => 'checkbox',
            ]
        ]
    ];

    const TEST_DETECT_PARAMETERS = [
        'https://vk.com/id1' => ['u' => 'id1'],
        'https://vk.com/groupname' => ['u' => 'groupname'],
        'https://m.vk.com/groupname' => ['u' => 'groupname'],
        'https://vk.com/groupname/anythingelse' => ['u' => 'groupname'],
        'https://vk.com/groupname?w=somethingelse' => ['u' => 'groupname'],
        'https://vk.com/with_underscore' => ['u' => 'with_underscore'],
    ];

    // Page title read from the og:title meta tag in collectData(); used by getName().
    protected $pageName;

    // Value of the "tz: N" field found in a page script; subtracted from parsed
    // timestamps in getTime(). NOTE(review): presumably an offset in seconds - confirm.
    protected $tz = 0;

    // Captures the first path segment after "vk.com/" as the group or user name.
    private $urlRegex = '/vk\.com\/([\w]+)/';

    /**
     * Returns the page URI for the configured "u" input, or the parent's URI
     * when no input is set.
     */
    public function getURI()
    {
        if (!is_null($this->getInput('u'))) {
            return static::URI . urlencode($this->getInput('u'));
        }

        return parent::getURI();
    }

    /**
     * Returns the page title captured by collectData(), or the parent's name
     * when no title has been captured.
     */
    public function getName()
    {
        if ($this->pageName) {
            return $this->pageName;
        }

        return parent::getName();
    }

    /**
     * Derives the "u" parameter from a URL.
     *
     * @param string $url URL to inspect
     * @return array|null ['u' => name] when the URL matches $urlRegex, null otherwise
     */
    public function detectParameters($url)
    {
        if (preg_match($this->urlRegex, $url, $matches)) {
            return ['u' => $matches[1]];
        }

        return null;
    }

    /**
     * Downloads the page, walks every ".post" element and appends one item per
     * post to $this->items.
     *
     * For each post the attachments (external links, articles, videos, photos,
     * albums, documents, polls, secondary attachments) are removed from the
     * post markup and summarised in a text suffix appended to the content.
     * A pinned post ("post_fixed" class) is held back and only added at the end
     * when it is the sole post or newer (by post id) than the last regular post.
     */
    public function collectData()
    {
        $text_html = $this->getContents();
        $text_html = iconv('windows-1251', 'utf-8//ignore', $text_html);

        $html = str_get_html($text_html);
        if (!$html) {
            // str_get_html() yields a falsy value when it cannot build a DOM
            returnServerError('Unable to parse the page returned by VK');
        }

        foreach ($html->find('script') as $script) {
            if (preg_match('/tz: ([0-9]+)/', $script->outertext, $matches)) {
                $this->tz = intval($matches[1]);
                break;
            }
        }

        $pageName = $html->find('meta[property="og:title"]', 0);
        if (is_object($pageName)) {
            $pageName = $pageName->getAttribute('content');
            $this->pageName = $pageName;
        }

        foreach ($html->find('div.replies') as $comment_block) {
            $comment_block->outertext = '';
        }

        // expensive operation
        $save = $html->save();
        $html->load($save);

        $hide_reposts = $this->getInput('hide_reposts') === true;
        $hashtag_prefix = '/feed?section=search&q=%23';

        $pinned_post_item = null;
        $last_post_id = 0;

        foreach ($html->find('.post') as $post) {
            if ($post->find('.wall_post_text_deleted')) {
                // repost of deleted post
                continue;
            }

            // Reposts are recognised by their quote block; decide up front so that
            // no work is spent on a post that is going to be dropped anyway.
            $copy_quote = $post->find('div.copy_quote', 0);
            if (is_object($copy_quote) && $hide_reposts) {
                continue;
            }

            defaultLinkTo($post, self::URI);

            $is_pinned_post = str_contains((string) $post->getAttribute('class'), 'post_fixed');

            // Remove 'Show more' button
            foreach ($post->find('button.PostTextMore') as $junk) {
                $junk->outertext = '';
            }

            $content_suffix = '';

            // looking for external links
            $external_link_selectors = [
                'a.page_media_link_title',
                'div.page_media_link_title > a',
                'div.media_desc > a.lnk',
            ];

            foreach ($external_link_selectors as $sel) {
                $a = $post->find($sel, 0);
                if (!is_object($a)) {
                    continue;
                }

                $innertext = $a->innertext;
                // parse_url() returns false for malformed URLs and omits absent components
                $parsed_url = parse_url((string) $a->getAttribute('href'));
                if (!is_array($parsed_url) || !str_starts_with($parsed_url['path'] ?? '', '/away.php')) {
                    continue;
                }

                $content_suffix .= "
External link: $innertext";
            }

            // remove external link from content
            $external_link_selectors_to_remove = [
                'div.page_media_thumbed_link',
                'div.page_media_link_desc_wrap',
                'div.media_desc > a.lnk',
            ];

            foreach ($external_link_selectors_to_remove as $sel) {
                $external_link_element = $post->find($sel, 0);
                if (is_object($external_link_element)) {
                    $external_link_element->outertext = '';
                }
            }

            // looking for article
            $article = $post->find('a.article_snippet', 0);
            if (is_object($article)) {
                if (str_contains((string) $article->getAttribute('class'), 'article_snippet_mini')) {
                    $article_title_selector = 'div.article_snippet_mini_title';
                    $article_author_selector = 'div.article_snippet_mini_info > .mem_link, div.article_snippet_mini_info > .group_link';
                    $article_thumb_selector = 'div.article_snippet_mini_thumb';
                } else {
                    $article_title_selector = 'div.article_snippet__title';
                    $article_author_selector = 'div.article_snippet__author';
                    $article_thumb_selector = 'div.article_snippet__image';
                }

                // Missing sub-elements yield empty strings instead of aborting the feed
                $article_title = $article->find($article_title_selector, 0)?->innertext ?? '';
                $article_author = $article->find($article_author_selector, 0)?->innertext ?? '';
                $article_img_element_style = (string) $article->find($article_thumb_selector, 0)?->getAttribute('style');

                if (preg_match('/background-image: url\((.*)\)/', $article_img_element_style, $matches)) {
                    $content_suffix .= "
";
                }

                $content_suffix .= "
Article: $article_title ($article_author)";
                $article->outertext = '';
            }

            // get all videos
            foreach ($post->find('a.page_post_thumb_video') as $a) {
                $video_title = htmlspecialchars_decode((string) $a->getAttribute('aria-label'));
                // keep only the part of the label before the last ' is '
                $video_title_split_pos = strrpos($video_title, ' is ');
                if ($video_title_split_pos !== false) {
                    $video_title = substr($video_title, 0, $video_title_split_pos);
                }
                $video_link = $a->getAttribute('href');
                $this->appendVideo($video_title, $video_link, backgroundToImg($a), $content_suffix);
                $a->outertext = '';
            }

            // get all photos
            foreach ($post->find('div.wall_text a.page_post_thumb_wrap') as $a) {
                $result = $this->getPhoto($a);
                // loose comparison on purpose: skips both null and an empty string
                if ($result == null) {
                    continue;
                }
                $a->outertext = '';
                $content_suffix .= "
$result";
            }

            // get albums
            foreach ($post->find('.page_album_wrap') as $el) {
                $album_title_element = $el->find('.page_album_link', 0)?->find('.page_album_title_text', 0);
                if (!is_object($album_title_element)) {
                    // unexpected markup: leave the album block untouched
                    continue;
                }
                $album_title = $album_title_element->getAttribute('title');
                $el->outertext = '';
                $content_suffix .= "
Album: $album_title";
            }

            // get photo documents
            foreach ($post->find('a.page_doc_photo_href') as $a) {
                $doc_gif_label_element = $a->find('.page_gif_label', 0);
                $doc_title_element = $a->find('.doc_label', 0);

                if (is_object($doc_gif_label_element)) {
                    $gif_preview_img = backgroundToImg($a->find('.page_doc_photo', 0));
                    $content_suffix .= "
Gif: $gif_preview_img";
                } elseif (is_object($doc_title_element)) {
                    $doc_title = $doc_title_element->innertext;
                    $content_suffix .= "
Doc: $doc_title";
                } else {
                    continue;
                }

                $a->outertext = '';
            }

            // get other documents
            foreach ($post->find('div.page_doc_row') as $div) {
                $doc_title_element = $div->find('a.page_doc_title', 0);
                if (!is_object($doc_title_element)) {
                    continue;
                }

                $doc_title = $doc_title_element->innertext;
                $content_suffix .= "
Doc: $doc_title";
                $div->outertext = '';
            }

            // get polls
            foreach ($post->find('div.page_media_poll_wrap') as $div) {
                $poll_title = $div->find('.page_media_poll_title', 0)?->innertext ?? '';
                $content_suffix .= "
Poll: $poll_title";
                foreach ($div->find('div.page_poll_text') as $poll_stat_title) {
                    $content_suffix .= '
- ' . $poll_stat_title->innertext;
                }
                $div->outertext = '';
            }

            // get sign / post author (defaults to the page title)
            $post_author = $pageName;
            $author_selectors = ['a.wall_signed_by', 'a.author'];
            foreach ($author_selectors as $author_selector) {
                $a = $post->find($author_selector, 0);
                if (is_object($a)) {
                    $post_author = $a->innertext;
                    $a->outertext = '';
                    break;
                }
            }

            // fix links and get post hashtags
            $hashtags = [];
            foreach ($post->find('a') as $a) {
                $href = $a->getAttribute('href');
                $innertext = $a->innertext;

                $hashtag = null;
                if ($href && str_starts_with($href, $hashtag_prefix)) {
                    $hashtag = urldecode(substr($href, strlen($hashtag_prefix)));
                } elseif (str_starts_with($innertext, '#')) {
                    $hashtag = $innertext;
                }

                if ($hashtag) {
                    // hashtag links are replaced by their plain text
                    $a->outertext = $innertext;
                    $hashtags[] = $hashtag;
                    continue;
                }

                // parse_url() returns false for malformed URLs; such links are left as they are
                $parsed_url = parse_url((string) $href);
                if (!is_array($parsed_url) || !isset($parsed_url['path'])) {
                    continue;
                }

                if (str_starts_with($parsed_url['path'], '/away.php')) {
                    // unwrap the redirect: the real target is carried in the "to" query parameter
                    parse_str($parsed_url['query'] ?? '', $parsed_query);
                    if (is_string($parsed_query['to'] ?? null)) {
                        $a->setAttribute('href', iconv(
                            'windows-1251',
                            'utf-8//ignore',
                            $parsed_query['to']
                        ));
                    }
                }
            }

            if (is_object($copy_quote)) {
                if ($copy_post_header = $copy_quote->find('div.copy_post_header', 0)) {
                    $copy_post_header->outertext = '';
                }

                $second_copy_quote = $copy_quote->find('div.published_sec_quote', 0);
                if (is_object($second_copy_quote)) {
                    $second_copy_quote_author = $second_copy_quote->find('a.copy_author', 0)?->outertext ?? '';
                    $second_copy_quote_content = $second_copy_quote->find('div.copy_post_date', 0)?->outertext ?? '';
                    $second_copy_quote->outertext = "
Reposted ($second_copy_quote_author): $second_copy_quote_content";
                }

                $copy_quote_author = $copy_quote->find('a.copy_author', 0)?->outertext ?? '';
                $copy_quote_content = $copy_quote->innertext;
                $copy_quote->outertext = "
Reposted ($copy_quote_author):
$copy_quote_content";
            }

            foreach ($post->find('.SecondaryAttachment') as $sa) {
                $sa_href = $sa->getAttribute('href');
                if (!$sa_href) {
                    $sa_href = '';
                }
                $sa_task_click = $sa->getAttribute('data-task-click');

                // Both lookups are side-effect free; absent elements become empty strings
                $sa_text = $sa->find('.SecondaryAttachment__childrenText', 0)?->innertext ?? '';
                $sa_subhead = $sa->find('.SecondaryAttachmentSubhead', 0)?->innertext ?? '';

                if (str_starts_with($sa_href, 'https://vk.com/doc')) {
                    // document
                    $doc_title = $sa_text;
                    $doc_size = $sa_subhead;
                    $content_suffix .= "
Doc: $doc_title ($doc_size)";
                    $sa->outertext = '';
                } elseif (str_starts_with($sa_href, 'https://vk.com/@')) {
                    // article
                    $article_title = $sa_text;
                    // empty author when the subhead does not carry the expected prefix
                    $article_author = explode('Article · from ', $sa_subhead)[1] ?? '';
                    $content_suffix .= "
Article: $article_title ($article_author)";
                    $sa->outertext = '';
                } elseif ($sa_task_click == 'SecondaryAttachment/playAudio') {
                    // audio
                    $audio_title = $sa_text;
                    $audio_author = $sa_subhead;
                    $content_suffix .= "
Audio: $audio_title ($audio_author)";
                    $sa->outertext = '';
                } elseif ($sa_task_click == 'SecondaryAttachment/playPlaylist') {
                    // playlist link
                    $playlist_title = $sa_text;
                    $content_suffix .= "
Playlist: $playlist_title";
                    $sa->outertext = '';
                }
            }

            $item = [];

            $wall_text = $post->find('div.wall_text', 0);
            if (is_object($wall_text)) {
                $content = strip_tags(backgroundToImg($wall_text->innertext), '
');
            } else {
                $content = '';
            }
            $content .= $content_suffix;
            if (!$content) {
                $content = '(empty post)';
            }

            $content_dom = str_get_html($content);
            if ($content_dom) {
                foreach ($content_dom->find('img') as $img) {
                    $parsed_src = parse_url((string) $img->getAttribute('src'));
                    if (!is_array($parsed_src)) {
                        continue;
                    }

                    // unblur images (case of impf)
                    // get original images instead of thumbnails (case of impg)
                    $src_path = $parsed_src['path'] ?? '';
                    $imgPrefix = '';
                    foreach (['/impf/', '/impg/'] as $candidate_prefix) {
                        if (str_starts_with($src_path, $candidate_prefix)) {
                            $imgPrefix = $candidate_prefix;
                            break;
                        }
                    }

                    if ($imgPrefix && isset($parsed_src['scheme'], $parsed_src['host'])) {
                        // rebuild the URL without the prefix directory and without the query string
                        $new_src = $parsed_src['scheme'] . '://' . $parsed_src['host'];
                        $new_src .= substr($src_path, strlen($imgPrefix) - 1);
                        $img->setAttribute('src', $new_src);
                    }
                }
                $content = $content_dom->outertext;
            }

            $item['content'] = $content;
            $item['categories'] = $hashtags;

            // get post link
            $post_link_element = $post->find('a.PostHeaderSubtitle__link', 0);
            if (!is_object($post_link_element)) {
                // without a header link there is no URI or id for this post; skip it
                continue;
            }
            $post_link = (string) $post_link_element->getAttribute('href');
            if (preg_match('/wall-?\d+_(\d+)/', $post_link, $preg_match_result)) {
                $item['post_id'] = intval($preg_match_result[1]);
            } else {
                $item['post_id'] = 0;
            }
            $item['uri'] = $post_link;
            $item['timestamp'] = $this->getTime($post);
            $item['title'] = $this->getTitle($item['content']);
            $item['author'] = $post_author;

            if ($is_pinned_post) {
                // do not append it now
                $pinned_post_item = $item;
            } else {
                $last_post_id = $item['post_id'];
                $this->items[] = $item;
            }
        }

        if (!is_null($pinned_post_item)) {
            if (count($this->items) == 0) {
                $this->items[] = $pinned_post_item;
            } elseif ($last_post_id < $pinned_post_item['post_id']) {
                // the pinned post is newer than the last regular post: include it
                // and restore descending order by post id
                $this->items[] = $pinned_post_item;
                usort($this->items, function ($item1, $item2) {
                    return $item2['post_id'] <=> $item1['post_id'];
                });
            }
        }
    }

    /**
     * Builds the replacement markup for a photo thumbnail link.
     *
     * The photo data is read from the JSON argument of the showPhoto() call in
     * the link's onclick attribute.
     *
     * @param object $a thumbnail anchor element
     * @return mixed null when the onclick data cannot be read; a newline string
     *               when a full-size variant is listed; otherwise the result of
     *               backgroundToImg() for the anchor
     */
    private function getPhoto($a)
    {
        $onclick = (string) $a->getAttribute('onclick');
        if (!preg_match('/return showPhoto\(.+?({.*})/', $onclick, $preg_match_result)) {
            return null;
        }

        // "queue:1" is an unquoted key, which json_decode() would reject
        $arg = htmlspecialchars_decode(str_replace('queue:1', '"queue":1', $preg_match_result[1]));
        $data = json_decode($arg, true);
        if (!is_array($data) || !$data) {
            return null;
        }

        $original = '';
        // Later keys overwrite earlier ones, so the last variant present wins
        foreach (['y_', 'z_', 'w_'] as $key) {
            if (!isset($data['temp'][$key][0])) {
                continue;
            }

            $variant = $data['temp'][$key][0];
            // absolute URLs are used as they are, anything else is relative to "base"
            $base = substr($variant, 0, 4) == 'http' ? '' : ($data['temp']['base'] ?? '');
            $original = $base . $variant;
        }

        if ($original) {
            return "
";
        }

        return backgroundToImg($a);
    }

    /**
     * Derives an item title from the item content.
     *
     * Takes the first line of the content, strips tags and returns the leading
     * run of characters allowed by the title pattern.
     *
     * @param string $content item content
     * @return string the title, or 'untitled' when nothing matches
     */
    private function getTitle($content)
    {
        $content = explode('
', $content)[0];
        $content = strip_tags($content);
        preg_match('/^[:\,"\w\ \p{L}\(\)\?#«»\-\–\—||&\.%\\₽\/+\;\!]+/mu', htmlspecialchars_decode($content), $result);
        if (count($result) == 0) {
            return 'untitled';
        }

        return $result[0];
    }

    /**
     * Returns the timestamp of a post.
     *
     * Uses the "time" attribute of span.rel_date when present. Otherwise the
     * text of time.PostHeaderSubtitle__item is parsed: a missing year is filled
     * in ("today", "yesterday ", or the current/previous year depending on the
     * month), and $this->tz is subtracted from the result.
     *
     * @param object $post post element
     * @return mixed the "time" attribute value, or a Unix timestamp
     */
    private function getTime($post)
    {
        $accurateDateElement = $post->find('span.rel_date', 0);
        if ($accurateDateElement) {
            return $accurateDateElement->getAttribute('time');
        }

        $strdate = $post->find('time.PostHeaderSubtitle__item', 0)->plaintext;
        // replace control and non-ASCII bytes with spaces before parsing
        $strdate = preg_replace('/[\x00-\x1F\x7F-\xFF]/', ' ', $strdate);
        $date = date_parse($strdate);

        if (!$date['year']) {
            if (str_contains($strdate, 'today')) {
                $strdate = date('d-m-Y') . ' ' . $strdate;
            } elseif (str_contains($strdate, 'yesterday ')) {
                $time = time() - 60 * 60 * 24;
                $strdate = date('d-m-Y', $time) . ' ' . $strdate;
            } elseif ($date['month'] && intval(date('m')) < $date['month']) {
                // a month later than the current one must belong to the previous year
                $strdate = $strdate . ' ' . (date('Y') - 1);
            } else {
                $strdate = $strdate . ' ' . date('Y');
            }

            $date = date_parse($strdate);
        } elseif ($date['hour'] === false) {
            $date['hour'] = $date['minute'] = '00';
        }

        $formatted = $date['day'] . '-' . $date['month'] . '-' . $date['year'] . ' ' . $date['hour'] . ':' . $date['minute'];

        return strtotime($formatted) - $this->tz;
    }

    /**
     * Downloads the page at getURI(), following at most two redirects by hand.
     *
     * Redirects are followed manually so that each target can be checked: a
     * redirect to a "/429.html" page or to a location outside vk.com is
     * reported as a server error.
     *
     * @return string response body of the first 200/304 response
     */
    private function getContents()
    {
        $httpHeaders = [
            'Accept-language: en',
            'Cookie: remixlang=3',
        ];
        $redirects = 0;
        $uri = $this->getURI();

        while ($redirects < 2) {
            $response = getContents($uri, $httpHeaders, [CURLOPT_FOLLOWLOCATION => false], true);

            if (in_array($response['code'], [200, 304])) {
                return $response['content'];
            }

            $location = $response['headers']['location'][0] ?? null;
            if ($location === null) {
                // neither a success nor a redirect: there is nothing to follow
                returnServerError('Unexpected response from VK: HTTP ' . $response['code']);
            }

            $uri = urljoin(self::URI, $location);

            if (str_contains($uri, '/429.html')) {
                returnServerError('VK responded "Too many requests"');
            }

            // the dot is escaped so that only the vk.com host itself is accepted
            if (!preg_match('#^https?://vk\.com/#', $uri)) {
                returnServerError('Unexpected redirect location');
            }

            $redirects++;
        }

        returnServerError('Too many redirects, while retreving content from VK');
    }

    /**
     * Appends a video entry (preview image followed by the title) to the
     * content suffix.
     *
     * @param string $video_title title; '(empty)' is used when it is falsy
     * @param string $video_link video URL (not used by this implementation)
     * @param mixed $previewImg preview markup, concatenated as a string
     * @param string $content_suffix suffix to append to, modified in place
     */
    protected function appendVideo($video_title, $video_link, $previewImg, &$content_suffix)
    {
        if (!$video_title) {
            $video_title = '(empty)';
        }

        $content_suffix .= '
' . $previewImg;
        $content_suffix .= 'Video: ' . $video_title;
    }
}