From d9490c65181a090dfc743cd181caef2cc53ae8ef Mon Sep 17 00:00:00 2001
From: Nick McCarthy <nicholas.mccarthy@gmail.com>
Date: Sat, 10 Jun 2023 17:35:04 +0100
Subject: [PATCH] GoogleScholarV2Bridge (#3415)

* Added google scholar v2 bridge with more functionality

* Corrected Sort By interpretation (this is weird on Google's part)

* Remove some debug statements

* Merged GoogleScholarBridge and GoogleScholarV2Bridge into GoogleScholarBridge with two contexts.

* Left V2 in Bridge Name

* Lint

* Update GoogleScholarBridge.php

* Update GoogleScholarBridge.php

* Lint.

* ;
---
 bridges/GoogleScholarBridge.php | 268 +++++++++++++++++++++++++-------
 1 file changed, 215 insertions(+), 53 deletions(-)

diff --git a/bridges/GoogleScholarBridge.php b/bridges/GoogleScholarBridge.php
index 932efb5b..981355dd 100644
--- a/bridges/GoogleScholarBridge.php
+++ b/bridges/GoogleScholarBridge.php
@@ -2,19 +2,101 @@
 
 class GoogleScholarBridge extends BridgeAbstract
 {
-    const NAME = 'Goolge Scholar';
+    const NAME = 'Google Scholar v2';
     const URI = 'https://scholar.google.com/';
-    const DESCRIPTION = 'Follow authors of scientific publications.';
-    const MAINTAINER = 'thefranke';
+    const DESCRIPTION = 'Search for publications or follow authors on Google Scholar.';
+    const MAINTAINER = 'nicholasmccarthy';
     const CACHE_TIMEOUT = 86400; // 24h
 
-    const PARAMETERS = [[
-        'userId' => [
-            'name' => 'User ID',
-            'exampleValue' => 'qc6CJjYAAAAJ',
-            'required' => true
-        ]
-    ]];
+    // Two contexts: 'user' takes an author profile ID, 'query' takes a search with optional filters.
+    const PARAMETERS = [
+        'user' => [
+            'userId' => [
+                'name' => 'User ID',
+                'exampleValue' => 'qc6CJjYAAAAJ',
+                'required' => true
+            ]
+        ],
+        'query' => [
+            'q' => [
+                'name' => 'Search Query',
+                'title' => 'Search Query',
+                'required' => true,
+                'exampleValue' => 'machine learning'
+            ],
+            'cites' => [
+                'name' => 'Cites',
+                'required' => false,
+                'default' => '',
+                'exampleValue' => '1275980731835430123',
+                'title' => 'Parameter defines unique ID for an article to trigger Cited By searches. Usage of cites '
+                    . 'will bring up a list of citing documents in Google Scholar. Example value: '
+                    . 'cites=1275980731835430123. Usage of cites and q parameters triggers search within citing articles.'
+            ],
+            'language' => [
+                'name' => 'Language',
+                'required' => false,
+                'default' => '',
+                'exampleValue' => 'en',
+                'title' => 'Parameter defines the language to use for the Google Scholar search.'
+            ],
+            'minCitations' => [
+                'name' => 'Minimum Citations',
+                'required' => false,
+                'type' => 'number',
+                'default' => '0',
+                'title' => 'Parameter defines the minimum number of citations in order for the results to be included.'
+            ],
+            'sinceYear' => [
+                'name' => 'Since Year',
+                'required' => false,
+                'type' => 'number',
+                'default' => '0',
+                'title' => 'Parameter defines the year from which you want the results to be included.'
+            ],
+            'untilYear' => [
+                'name' => 'Until Year',
+                'required' => false,
+                'type' => 'number',
+                'default' => '0',
+                'title' => 'Parameter defines the year until which you want the results to be included.'
+            ],
+            'sortBy' => [
+                'name' => 'Sort By Date',
+                'type' => 'checkbox',
+                'default' => false,
+                'title' => 'Parameter defines articles added in the last year, sorted by date. Alternatively sorts '
+                    . 'by relevance. This overrides Since-Until Year values.',
+            ],
+            'includePatents' => [
+                'name' => 'Include Patents',
+                'type' => 'checkbox',
+                'default' => false,
+                'title' => 'Include Patents',
+            ],
+            'includeCitations' => [
+                'name' => 'Include Citations',
+                'type' => 'checkbox',
+                'default' => true,
+                'title' => 'Parameter defines whether you would like to include citations or not.',
+            ],
+            'reviewArticles' => [
+                'name' => 'Only Review Articles',
+                'type' => 'checkbox',
+                'default' => false,
+                'title' => 'Parameter defines whether you would like to show only review articles or not (these '
+                    . 'articles consist of topic reviews, or discuss the works or authors you have searched for).',
+            ],
+            'numResults' => [
+                'name' => 'Number of Results (max 20)',
+                'required' => false,
+                'type' => 'number',
+                'default' => 10,
+                'exampleValue' => 10,
+                'title' => 'Number of results to return'
+            ]
+        ],
+    ];
 
     public function getIcon()
     {
@@ -23,58 +105,138 @@ class GoogleScholarBridge extends BridgeAbstract
 
+    /**
+     * Fills $this->items for the queried context: 'user' lists up to 10 publications
+     * of one author, 'query' runs a Scholar search and keeps sufficiently cited results.
+     */
     public function collectData()
     {
-        $uri = self::URI . '/citations?hl=en&view_op=list_works&sortby=pubdate&user=' . $this->getInput('userId');
+        switch ($this->queriedContext) {
+            case 'user':
+                // self::URI already ends with '/', so paths are appended without a leading slash.
+                $userId = urlencode((string)$this->getInput('userId'));
+                $uri = self::URI . 'citations?hl=en&view_op=list_works&sortby=pubdate&user=' . $userId;
+                $html = getSimpleHTMLDOM($uri) or returnServerError('Could not fetch Google Scholar data.');
 
-        $html = getSimpleHTMLDOM($uri)
-            or returnServerError('Could not fetch Google Scholar data.');
+                $publications = $html->find('tr[class="gsc_a_tr"]');
 
-        $publications = $html->find('tr[class="gsc_a_tr"]');
+                foreach ($publications as $publication) {
+                    $titleLink = $publication->find('a[class="gsc_a_at"]', 0);
+                    $articleUrl = self::URI . ltrim(htmlspecialchars_decode($titleLink->href), '/');
+                    $articleTitle = $titleLink->plaintext;
 
-        foreach ($publications as $publication) {
-            $articleUrl = self::URI . htmlspecialchars_decode($publication->find('a[class="gsc_a_at"]', 0)->href);
-            $articleTitle = $publication->find('a[class="gsc_a_at"]', 0)->plaintext;
+                    # fetch the article itself to extract rest of content
+                    $contentArticle = getSimpleHTMLDOMCached($articleUrl);
+                    $articleEntries = $contentArticle->find('div[class="gs_scl"]');
 
-            # fetch the article itself to extract rest of content
-            $contentArticle = getSimpleHTMLDOMCached($articleUrl);
-            $articleEntries = $contentArticle->find('div[class="gs_scl"]');
+                    $articleDate = '';
+                    $articleAbstract = '';
+                    $articleAuthor = '';
+                    $content = '';
 
-            $articleDate = '';
-            $articleAbstract = '';
-            $articleAuthor = '';
-            $content = '';
+                    foreach ($articleEntries as $entry) {
+                        $field = $entry->find('div[class="gsc_oci_field"]', 0)->plaintext;
+                        $value = $entry->find('div[class="gsc_oci_value"]', 0)->plaintext;
 
-            foreach ($articleEntries as $entry) {
-                $field = $entry->find('div[class="gsc_oci_field"]', 0)->plaintext;
-                $value = $entry->find('div[class="gsc_oci_value"]', 0)->plaintext;
+                        if ($field == 'Publication date') {
+                            $articleDate = $value;
+                        } elseif ($field == 'Description') {
+                            $articleAbstract = $value;
+                        } elseif ($field == 'Authors') {
+                            $articleAuthor = $value;
+                        } elseif ($field == 'Scholar articles' || $field == 'Total citations') {
+                            continue;
+                        } else {
+                            $content = $content . $field . ': ' . $value . '<br><br>';
+                        }
+                    }
 
-                if ($field == 'Publication date') {
-                    $articleDate = $value;
-                } else if ($field == 'Description') {
-                    $articleAbstract = $value;
-                } else if ($field == 'Authors') {
-                    $articleAuthor = $value;
-                } else if ($field == 'Scholar articles' || $field == 'Total citations') {
-                    continue;
-                } else {
-                    $content = $content . $field . ': ' . $value . '<br><br>';
+                    $content .= $articleAbstract;
+
+                    $this->items[] = [
+                        'title' => $articleTitle,
+                        'uri' => $articleUrl,
+                        'timestamp' => strtotime($articleDate),
+                        'author' => $articleAuthor,
+                        'content' => $content,
+                    ];
+
+                    if (count($this->items) >= 10) {
+                        break;
+                    }
+                }
+                break;
+            case 'query':
+                $query = urlencode($this->getInput('q'));
+                $cites = (string)$this->getInput('cites');
+                $language = (string)$this->getInput('language');
+                $sinceYear = (int)$this->getInput('sinceYear');
+                $untilYear = (int)$this->getInput('untilYear');
+                $minCitations = (int)$this->getInput('minCitations');
+                $includeCitations = $this->getInput('includeCitations');
+                $includePatents = $this->getInput('includePatents');
+                $reviewArticles = $this->getInput('reviewArticles');
+                $sortBy = $this->getInput('sortBy');
+                $numResults = (int)$this->getInput('numResults');
+
+                # Build URI
+                $uri = self::URI . 'scholar?q=' . $query;
+                // Forward 'cites' (previously read but never sent) so Cited By searches actually work.
+                $uri .= $cites !== '' ? '&cites=' . urlencode($cites) : '';
+                $uri .= $sinceYear !== 0 ? '&as_ylo=' . $sinceYear : '';
+                $uri .= $untilYear !== 0 ? '&as_yhi=' . $untilYear : '';
+                $uri .= $language !== '' ? '&hl=' . urlencode($language) : '';
+                // Patents and citations are separate switches: as_sdt (7 = include patents, 0 = exclude)
+                // and as_vis (0 = include citations, 1 = exclude); sending as_vis twice made them clash.
+                $uri .= $includePatents ? '&as_sdt=7' : '&as_sdt=0';
+                $uri .= $includeCitations ? '&as_vis=0' : '&as_vis=1';
+                $uri .= $reviewArticles ? '&as_rr=1' : '';
+                $uri .= $sortBy ? '&scisbd=1' : '';
+                // The form promises "max 20": cap the value and omit the parameter when it is unset.
+                $uri .= $numResults > 0 ? '&num=' . min($numResults, 20) : '';
+
+                $html = getSimpleHTMLDOM($uri) or returnServerError('Could not fetch Google Scholar data.');
+
+                $publications = $html->find('div[class="gs_r gs_or gs_scl"]');
+
+                foreach ($publications as $publication) {
+                    $articleTitleElement = $publication->find('h3[class="gs_rt"]', 0);
+                    if (!$articleTitleElement) {
+                        continue;
+                    }
+                    // Results whose title has no link keep an empty URI instead of dereferencing null.
+                    $titleLink = $articleTitleElement->find('a', 0);
+                    $articleUrl = $titleLink ? $titleLink->href : '';
+
+                    // NOTE(review): gs_a feeds author and date; strtotime() presumably cannot parse it - confirm.
+                    $metaElement = $publication->find('div[class="gs_a"]', 0);
+                    $articleMeta = $metaElement ? $metaElement->plaintext : '';
+
+                    $articleAbstractElement = $publication->find('div[class="gs_rs"]', 0);
+                    $articleAbstract = $articleAbstractElement ? $articleAbstractElement->plaintext : '';
+
+                    // Read the "Cited by N" link of the bottom row; results without one count as 0 citations.
+                    $citedBy = 0;
+                    $bottomRowElement = $publication->find('div[class="gs_fl"]', 0);
+                    $anchorTags = $bottomRowElement ? $bottomRowElement->find('a') : [];
+                    foreach ($anchorTags as $anchorTag) {
+                        if (preg_match('/Cited by (\d+)/', $anchorTag->plaintext, $match)) {
+                            $citedBy = (int)$match[1];
+                            break;
+                        }
+                    }
+
+                    if ($citedBy < $minCitations) {
+                        continue;
+                    }
+
+                    $this->items[] = [
+                        'title' => $articleTitleElement->plaintext,
+                        'uri' => $articleUrl,
+                        'timestamp' => strtotime($articleMeta),
+                        'author' => $articleMeta,
+                        'content' => $articleAbstract,
+                    ];
                 }
-            }
-
-            $content = $content . $articleAbstract;
-
-            $item = [];
-
-            $item['title'] = $articleTitle;
-            $item['uri'] = $articleUrl;
-            $item['timestamp'] = strtotime($articleDate);
-            $item['author'] = $articleAuthor;
-            $item['content'] = $content;
-
-            $this->items[] = $item;
-
-            if (count($this->items) >= 10) {
                 break;
-            }
         }
     }
 }