Clean up search engine

Notable changes:
1. Prevent excessive engine module imports.
2. Replace trivial usage of `join()`.
3. Keep the output text sorted whenever possible.
4. Close handles properly.
5. Print errors to stderr, not stdout.
6. Report the search job exit code.
7. Print the exception message to stderr if an exception was thrown while running a search job.
8. Utilize the XML library to build XML data and use 2 spaces as indentation.
PR #21098.
2024-11-28 21:38:51 +03:00 · 2024-07-22 16:51:57 +08:00 · 2024-07-22 16:51:57 +08:00 · 69a829dfb0
commit 69a829dfb0
parent 3c5baac150
4 changed files with 168 additions and 198 deletions
--- a/src/base/search/searchpluginmanager.cpp
+++ b/src/base/search/searchpluginmanager.cpp
@ -367,14 +367,14 @@ QString SearchPluginManager::categoryFullName(const QString &categoryName)
    const QHash<QString, QString> categoryTable
    {
        {u"all"_s, tr("All categories")},
        {u"movies"_s, tr("Movies")},
        {u"tv"_s, tr("TV shows")},
        {u"music"_s, tr("Music")},
        {u"games"_s, tr("Games")},
        {u"anime"_s, tr("Anime")},
-        {u"software"_s, tr("Software")},
+        {u"books"_s, tr("Books")},
        {u"games"_s, tr("Games")},
        {u"movies"_s, tr("Movies")},
        {u"music"_s, tr("Music")},
        {u"pictures"_s, tr("Pictures")},
-        {u"books"_s, tr("Books")}
+        {u"software"_s, tr("Software")},
        {u"tv"_s, tr("TV shows")}
    };
    return categoryTable.value(categoryName);
 }
--- a/src/searchengine/nova3/helpers.py
+++ b/src/searchengine/nova3/helpers.py
@ -1,4 +1,4 @@
-#VERSION: 1.47
+#VERSION: 1.48
 # Author:
 #  Christophe DUMEZ (chris@qbittorrent.org)
@ -35,12 +35,12 @@ import os
 import re
 import socket
 import socks
 import sys
 import tempfile
 import urllib.error
 import urllib.parse
 import urllib.request
 from collections.abc import Mapping
-from typing import Any, Dict, Optional
+from typing import Any, Optional
 def getBrowserUserAgent() -> str:
@ -59,7 +59,7 @@ def getBrowserUserAgent() -> str:
    return f"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:{nowVersion}.0) Gecko/20100101 Firefox/{nowVersion}.0"
-headers: Dict[str, Any] = {'User-Agent': getBrowserUserAgent()}
+headers: dict[str, Any] = {'User-Agent': getBrowserUserAgent()}
 # SOCKS5 Proxy support
 if "sock_proxy" in os.environ and len(os.environ["sock_proxy"].strip()) > 0:
@ -91,51 +91,52 @@ def htmlentitydecode(s: str) -> str:
 def retrieve_url(url: str, custom_headers: Mapping[str, Any] = {}) -> str:
    """ Return the content of the url page as a string """
-    req = urllib.request.Request(url, headers={**headers, **custom_headers})
+
    request = urllib.request.Request(url, headers={**headers, **custom_headers})
    try:
-        response = urllib.request.urlopen(req)
+        response = urllib.request.urlopen(request)
    except urllib.error.URLError as errno:
-        print(" ".join(("Connection error:", str(errno.reason))))
+        print(f"Connection error: {errno.reason}", file=sys.stderr)
        return ""
-    dat: bytes = response.read()
+    data: bytes = response.read()
    # Check if it is gzipped
-    if dat[:2] == b'\x1f\x8b':
+    if data[:2] == b'\x1f\x8b':
        # Data is gzip encoded, decode it
-        compressedstream = io.BytesIO(dat)
+        with io.BytesIO(data) as compressedStream, gzip.GzipFile(fileobj=compressedStream) as gzipper:
-        gzipper = gzip.GzipFile(fileobj=compressedstream)
+            data = gzipper.read()
-        extracted_data = gzipper.read()
+
        dat = extracted_data
    info = response.info()
    charset = 'utf-8'
    try:
-        ignore, charset = info['Content-Type'].split('charset=')
+        charset = response.getheader('Content-Type', '').split('charset=', 1)[1]
-    except Exception:
+    except IndexError:
        pass
-    datStr = dat.decode(charset, 'replace')
+
-    datStr = htmlentitydecode(datStr)
+    dataStr = data.decode(charset, 'replace')
-    return datStr
+    dataStr = htmlentitydecode(dataStr)
    return dataStr
 def download_file(url: str, referer: Optional[str] = None) -> str:
    """ Download file at url and write it to a file, return the path to the file and the url """
-    fileHandle, path = tempfile.mkstemp()
+
    file = os.fdopen(fileHandle, "wb")
    # Download url
-    req = urllib.request.Request(url, headers=headers)
+    request = urllib.request.Request(url, headers=headers)
    if referer is not None:
-        req.add_header('referer', referer)
+        request.add_header('referer', referer)
-    response = urllib.request.urlopen(req)
+    response = urllib.request.urlopen(request)
-    dat = response.read()
+    data = response.read()
    # Check if it is gzipped
-    if dat[:2] == b'\x1f\x8b':
+    if data[:2] == b'\x1f\x8b':
        # Data is gzip encoded, decode it
-        compressedstream = io.BytesIO(dat)
+        with io.BytesIO(data) as compressedStream, gzip.GzipFile(fileobj=compressedStream) as gzipper:
-        gzipper = gzip.GzipFile(fileobj=compressedstream)
+            data = gzipper.read()
        extracted_data = gzipper.read()
        dat = extracted_data
    # Write it to a file
-    file.write(dat)
+    fileHandle, path = tempfile.mkstemp()
-    file.close()
+    with os.fdopen(fileHandle, "wb") as file:
        file.write(data)
    # return file path
-    return (path + " " + url)
+    return f"{path} {url}"
--- a/src/searchengine/nova3/nova2.py
+++ b/src/searchengine/nova3/nova2.py
@ -1,4 +1,4 @@
-#VERSION: 1.46
+#VERSION: 1.47
 # Author:
 #  Fabien Devaux <fab AT gnux DOT info>
@ -36,13 +36,15 @@
 import importlib
 import pathlib
 import sys
 import traceback
 import urllib.parse
-from collections.abc import Iterable, Iterator, Sequence
+import xml.etree.ElementTree as ET
 from collections.abc import Iterable
 from enum import Enum
 from glob import glob
 from multiprocessing import Pool, cpu_count
 from os import path
-from typing import Dict, List, Optional, Set, Tuple, Type
+from typing import Optional
 THREADED: bool = True
 try:
@ -50,7 +52,7 @@ try:
 except NotImplementedError:
    MAX_THREADS = 1
-Category = Enum('Category', ['all', 'movies', 'tv', 'music', 'games', 'anime', 'software', 'pictures', 'books'])
+Category = Enum('Category', ['all', 'anime', 'books', 'games', 'movies', 'music', 'pictures', 'software', 'tv'])
 ################################################################################
@ -62,13 +64,13 @@ Category = Enum('Category', ['all', 'movies', 'tv', 'music', 'games', 'anime', '
 ################################################################################
-EngineName = str
+EngineModuleName = str  # the filename of the engine plugin
 class Engine:
    url: str
-    name: EngineName
+    name: str
-    supported_categories: Dict[str, str]
+    supported_categories: dict[str, str]
    def __init__(self) -> None:
        pass
@ -81,112 +83,89 @@ class Engine:
 # global state
-engine_dict: Dict[EngineName, Optional[Type[Engine]]] = {}
+engine_dict: dict[EngineModuleName, Optional[type[Engine]]] = {}
-def list_engines() -> List[EngineName]:
+def list_engines() -> list[EngineModuleName]:
    """ List all engines,
-        including broken engines that fail on import
+        including broken engines that would fail on import
-        Faster than initialize_engines
+        Return list of all engines' module name
        Return list of all engines
    """
-    found_engines = []
+
    names = []
    for engine_path in glob(path.join(path.dirname(__file__), 'engines', '*.py')):
-        engine_name = path.basename(engine_path).split('.')[0].strip()
+        engine_module_name = path.basename(engine_path).split('.')[0].strip()
-        if len(engine_name) == 0 or engine_name.startswith('_'):
+        if len(engine_module_name) == 0 or engine_module_name.startswith('_'):
            continue
-        found_engines.append(engine_name)
+        names.append(engine_module_name)
-    return found_engines
+    return sorted(names)
-def get_engine(engine_name: EngineName) -> Optional[Type[Engine]]:
+def import_engine(engine_module_name: EngineModuleName) -> Optional[type[Engine]]:
-    if engine_name in engine_dict:
+    if engine_module_name in engine_dict:
-        return engine_dict[engine_name]
+        return engine_dict[engine_module_name]
-    # when import fails, engine is None
+    # when import fails, return `None`
-    engine = None
+    engine_class = None
    try:
-        # import engines.[engine]
+        # import engines.[engine_module_name]
-        engine_module = importlib.import_module("engines." + engine_name)
+        engine_module = importlib.import_module(f"engines.{engine_module_name}")
-        engine = getattr(engine_module, engine_name)
+        engine_class = getattr(engine_module, engine_module_name)
    except Exception:
        pass
-    engine_dict[engine_name] = engine
+
-    return engine
+    engine_dict[engine_module_name] = engine_class
    return engine_class
-def initialize_engines(found_engines: Iterable[EngineName]) -> Set[EngineName]:
+def get_capabilities(engines: Iterable[EngineModuleName]) -> str:
    """ Import available engines
        Return set of available engines
    """
-    supported_engines = set()
+    Return capabilities in XML format
    for engine_name in found_engines:
        # import engine
        engine = get_engine(engine_name)
        if engine is None:
            continue
        supported_engines.add(engine_name)
    return supported_engines
 def engines_to_xml(supported_engines: Iterable[EngineName]) -> Iterator[str]:
    """ Generates xml for supported engines """
    tab = " " * 4
    for engine_name in supported_engines:
        search_engine = get_engine(engine_name)
        if search_engine is None:
            continue
        supported_categories = ""
        if hasattr(search_engine, "supported_categories"):
            supported_categories = " ".join((key
                                             for key in search_engine.supported_categories.keys()
                                             if key != Category.all.name))
        yield "".join((tab, "<", engine_name, ">\n",
                       tab, tab, "<name>", search_engine.name, "</name>\n",
                       tab, tab, "<url>", search_engine.url, "</url>\n",
                       tab, tab, "<categories>", supported_categories, "</categories>\n",
                       tab, "</", engine_name, ">\n"))
 def displayCapabilities(supported_engines: Iterable[EngineName]) -> None:
    """
    Display capabilities in XML format
    <capabilities>
-      <engine_short_name>
+      <engine_module_name>
        <name>long name</name>
        <url>http://example.com</url>
        <categories>movies music games</categories>
-      </engine_short_name>
+      </engine_module_name>
    </capabilities>
    """
-    xml = "".join(("<capabilities>\n",
+
-                   "".join(engines_to_xml(supported_engines)),
+    capabilities_element = ET.Element('capabilities')
-                   "</capabilities>"))
+
-    print(xml)
+    for engine_module_name in engines:
        engine_class = import_engine(engine_module_name)
        if engine_class is None:
            continue
        engine_module_element = ET.SubElement(capabilities_element, engine_module_name)
        ET.SubElement(engine_module_element, 'name').text = engine_class.name
        ET.SubElement(engine_module_element, 'url').text = engine_class.url
        supported_categories = ""
        if hasattr(engine_class, "supported_categories"):
            supported_categories = " ".join((key
                                             for key in sorted(engine_class.supported_categories.keys())
                                             if key != Category.all.name))
        ET.SubElement(engine_module_element, 'categories').text = supported_categories
    ET.indent(capabilities_element)
    return ET.tostring(capabilities_element, 'unicode')
-def run_search(engine_list: Tuple[Optional[Type[Engine]], str, Category]) -> bool:
+def run_search(search_params: tuple[type[Engine], str, Category]) -> bool:
    """ Run search in engine
-        @param engine_list Tuple with engine, query and category
+        @param search_params Tuple with engine, query and category
        @retval False if any exceptions occurred
        @retval True  otherwise
    """
    engine_class, what, cat = engine_list
    if engine_class is None:
        return False
    engine_class, what, cat = search_params
    try:
        engine = engine_class()
        # avoid exceptions due to invalid category
@ -195,73 +174,65 @@ def run_search(engine_list: Tuple[Optional[Type[Engine]], str, Category]) -> boo
                engine.search(what, cat.name)
        else:
            engine.search(what)
        return True
    except Exception:
        traceback.print_exc()
        return False
-def main(args: Sequence[str]) -> None:
+if __name__ == "__main__":
    def main() -> int:
        # qbt tend to run this script in 'isolate mode' so append the current path manually
        current_path = str(pathlib.Path(__file__).parent.resolve())
        if current_path not in sys.path:
            sys.path.append(current_path)
        # https://docs.python.org/3/library/sys.html#sys.exit
        class ExitCode(Enum):
            OK = 0
            AppError = 1
            ArgError = 2
        found_engines = list_engines()
-    def show_usage() -> None:
+        prog_name = sys.argv[0]
-        print("./nova2.py all|engine1[,engine2]* <category> <keywords>", file=sys.stderr)
+        prog_usage = (f"Usage: {prog_name} all|engine1[,engine2]* <category> <keywords>\n"
-        print("found engines: " + ','.join(found_engines), file=sys.stderr)
+                      f"To list available engines: {prog_name} --capabilities [--names]\n"
-        print("to list available engines: ./nova2.py --capabilities [--names]", file=sys.stderr)
+                      f"Found engines: {','.join(found_engines)}")
-    if not args:
+        if "--capabilities" in sys.argv:
-        show_usage()
+            if "--names" in sys.argv:
-        sys.exit(1)
+                print(",".join((e for e in found_engines if import_engine(e) is not None)))
-    elif args[0] == "--capabilities":
+                return ExitCode.OK.value
        supported_engines = initialize_engines(found_engines)
        if "--names" in args:
            print(",".join(supported_engines))
            return
        displayCapabilities(supported_engines)
        return
    elif len(args) < 3:
        show_usage()
        sys.exit(1)
-    cat = args[1].lower()
+            print(get_capabilities(found_engines))
            return ExitCode.OK.value
        elif len(sys.argv) < 4:
            print(prog_usage, file=sys.stderr)
            return ExitCode.ArgError.value
        # get unique engines
        engs = set(arg.strip().lower() for arg in sys.argv[1].split(','))
        engines = found_engines if 'all' in engs else [e for e in found_engines if e in engs]
        cat = sys.argv[2].lower()
        try:
            category = Category[cat]
        except KeyError:
-        print(" - ".join(('Invalid category', cat)), file=sys.stderr)
+            print(f"Invalid category: {cat}", file=sys.stderr)
-        sys.exit(1)
+            return ExitCode.ArgError.value
-    # get only unique engines with set
+        what = urllib.parse.quote(' '.join(sys.argv[3:]))
-    engines_list = set(e.lower() for e in args[0].strip().split(','))
+        params = ((engine_class, what, category) for e in engines if (engine_class := import_engine(e)) is not None)
    if not engines_list:
        # engine list is empty. Nothing to do here
        return
    if 'all' in engines_list:
        # use all supported engines
        # note: this can be slower than passing a list of supported engines
        # because initialize_engines will also try to import not-supported engines
        engines_list = initialize_engines(found_engines)
    else:
        # discard not-found engines
        engines_list = {engine for engine in engines_list if engine in found_engines}
    what = urllib.parse.quote(' '.join(args[2:]))
    params = ((get_engine(engine_name), what, category) for engine_name in engines_list)
        search_success = False
        if THREADED:
-        # child process spawning is controlled min(number of searches, number of cpu)
+            processes = max(min(len(engines), MAX_THREADS), 1)
-        with Pool(min(len(engines_list), MAX_THREADS)) as pool:
+            with Pool(processes) as pool:
-            pool.map(run_search, params)
+                search_success = all(pool.map(run_search, params))
        else:
-        # py3 note: map is needed to be evaluated for content to be executed
+            search_success = all(map(run_search, params))
        all(map(run_search, params))
        return ExitCode.OK.value if search_success else ExitCode.AppError.value
-if __name__ == "__main__":
+    sys.exit(main())
    main(sys.argv[1:])
--- a/src/searchengine/nova3/novaprinter.py
+++ b/src/searchengine/nova3/novaprinter.py
@ -1,4 +1,4 @@
-#VERSION: 1.50
+#VERSION: 1.51
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions are met:
@ -25,21 +25,18 @@
 # POSSIBILITY OF SUCH DAMAGE.
 import re
-from collections.abc import Mapping
+from typing import TypedDict, Union
 from typing import Any, Union
-# TODO: enable the following when using Python >= 3.8
+SearchResults = TypedDict('SearchResults', {
-#SearchResults = TypedDict('SearchResults', {
+    'link': str,
-#    'link': str,
+    'name': str,
-#    'name': str,
+    'size': Union[float, int, str],  # TODO: use `float | int | str` when using Python >= 3.10
-#    'size': Union[float, int, str],
+    'seeds': int,
-#    'seeds': int,
+    'leech': int,
-#    'leech': int,
+    'engine_url': str,
-#    'engine_url': str,
+    'desc_link': str,  # Optional  # TODO: use `NotRequired[str]` when using Python >= 3.11
-#    'desc_link': str,  # Optional  # TODO: use `NotRequired[str]` when using Python >= 3.11
+    'pub_date': int  # Optional  # TODO: use `NotRequired[int]` when using Python >= 3.11
-#    'pub_date': int  # Optional  # TODO: use `NotRequired[int]` when using Python >= 3.11
+})
 #})
 SearchResults = Mapping[str, Any]
 def prettyPrinter(dictionary: SearchResults) -> None:
@ -62,6 +59,7 @@ def prettyPrinter(dictionary: SearchResults) -> None:
 sizeUnitRegex: re.Pattern[str] = re.compile(r"^(?P<size>\d*\.?\d+) *(?P<unit>[a-z]+)?", re.IGNORECASE)
 # TODO: use `float | int | str` when using Python >= 3.10
 def anySizeToBytes(size_string: Union[float, int, str]) -> int:
    """
    Convert a string like '1 KB' to '1024' (bytes)