2016-08-04 18:08:32 +03:00
|
|
|
|
#
|
2023-11-21 23:29:58 +03:00
|
|
|
|
# This file is licensed under the Affero General Public License (AGPL) version 3.
|
|
|
|
|
#
|
2024-01-23 14:26:48 +03:00
|
|
|
|
# Copyright 2014-2016 OpenMarket Ltd
|
2023-11-21 23:29:58 +03:00
|
|
|
|
# Copyright (C) 2023 New Vector, Ltd
|
|
|
|
|
#
|
|
|
|
|
# This program is free software: you can redistribute it and/or modify
|
|
|
|
|
# it under the terms of the GNU Affero General Public License as
|
|
|
|
|
# published by the Free Software Foundation, either version 3 of the
|
|
|
|
|
# License, or (at your option) any later version.
|
|
|
|
|
#
|
|
|
|
|
# See the GNU Affero General Public License for more details:
|
|
|
|
|
# <https://www.gnu.org/licenses/agpl-3.0.html>.
|
|
|
|
|
#
|
|
|
|
|
# Originally licensed under the Apache License, Version 2.0:
|
|
|
|
|
# <http://www.apache.org/licenses/LICENSE-2.0>.
|
|
|
|
|
#
|
|
|
|
|
# [This file includes modifications made by New Vector Limited]
|
2016-08-04 18:08:32 +03:00
|
|
|
|
#
|
|
|
|
|
#
|
|
|
|
|
|
2023-02-27 16:26:05 +03:00
|
|
|
|
from synapse.media.preview_html import (
|
2021-12-13 20:55:07 +03:00
|
|
|
|
_get_html_media_encodings,
|
2021-10-08 21:14:42 +03:00
|
|
|
|
decode_body,
|
2021-12-13 20:55:07 +03:00
|
|
|
|
parse_html_to_open_graph,
|
2018-07-09 09:09:20 +03:00
|
|
|
|
summarize_paragraphs,
|
2016-08-16 16:53:18 +03:00
|
|
|
|
)
|
2016-08-04 18:08:32 +03:00
|
|
|
|
|
2022-01-24 16:58:18 +03:00
|
|
|
|
from tests import unittest
|
2018-07-09 09:09:20 +03:00
|
|
|
|
|
2021-01-07 14:41:28 +03:00
|
|
|
|
try:
|
|
|
|
|
import lxml
|
|
|
|
|
except ImportError:
|
2023-05-31 20:06:57 +03:00
|
|
|
|
lxml = None # type: ignore[assignment]
|
2021-01-07 14:41:28 +03:00
|
|
|
|
|
2016-08-04 18:08:32 +03:00
|
|
|
|
|
2021-02-08 20:33:30 +03:00
|
|
|
|
class SummarizeTestCase(unittest.TestCase):
    """Exercises ``summarize_paragraphs`` with ``min_size=200`` and ``max_size=500``."""

    if lxml is None:
        skip = "url preview feature requires lxml"

    def test_long_summarize(self) -> None:
        """Long paragraphs: a single one is enough for the summary, and an
        overlong one is cut short and finished with an ellipsis."""
        paragraphs = [
            """Tromsø (Norwegian pronunciation: [ˈtrʊmsœ] ( listen); Northern Sami:
            Romsa; Finnish: Tromssa[2] Kven: Tromssa) is a city and municipality in
            Troms county, Norway. The administrative centre of the municipality is
            the city of Tromsø. Outside of Norway, Tromso and Tromsö are
            alternative spellings of the city.Tromsø is considered the northernmost
            city in the world with a population above 50,000. The most populous town
            north of it is Alta, Norway, with a population of 14,272 (2013).""",
            """Tromsø lies in Northern Norway. The municipality has a population of
            (2015) 72,066, but with an annual influx of students it has over 75,000
            most of the year. It is the largest urban area in Northern Norway and the
            third largest north of the Arctic Circle (following Murmansk and Norilsk).
            Most of Tromsø, including the city centre, is located on the island of
            Tromsøya, 350 kilometres (217 mi) north of the Arctic Circle. In 2012,
            Tromsøya had a population of 36,088. Substantial parts of the urban area
            are also situated on the mainland to the east, and on parts of Kvaløya—a
            large island to the west. Tromsøya is connected to the mainland by the Tromsø
            Bridge and the Tromsøysund Tunnel, and to the island of Kvaløya by the
            Sandnessund Bridge. Tromsø Airport connects the city to many destinations
            in Europe. The city is warmer than most other places located on the same
            latitude, due to the warming effect of the Gulf Stream.""",
            """The city centre of Tromsø contains the highest number of old wooden
            houses in Northern Norway, the oldest house dating from 1789. The Arctic
            Cathedral, a modern church from 1965, is probably the most famous landmark
            in Tromsø. The city is a cultural centre for its region, with several
            festivals taking place in the summer. Some of Norway's best-known
            musicians, Torbjørn Brundtland and Svein Berge of the electronica duo
            Röyksopp and Lene Marlin grew up and started their careers in Tromsø.
            Noted electronic musician Geir Jenssen also hails from Tromsø.""",
        ]

        # Starting at the first paragraph, the summary is that paragraph alone,
        # with its line breaks replaced by single spaces.
        summary = summarize_paragraphs(paragraphs, min_size=200, max_size=500)
        self.assertEqual(
            summary,
            "Tromsø (Norwegian pronunciation: [ˈtrʊmsœ] ( listen); Northern Sami:"
            " Romsa; Finnish: Tromssa[2] Kven: Tromssa) is a city and municipality in"
            " Troms county, Norway. The administrative centre of the municipality is"
            " the city of Tromsø. Outside of Norway, Tromso and Tromsö are"
            " alternative spellings of the city.Tromsø is considered the northernmost"
            " city in the world with a population above 50,000. The most populous town"
            " north of it is Alta, Norway, with a population of 14,272 (2013).",
        )

        # Starting at the (much longer) second paragraph, the summary stops
        # part-way through it and ends in "…".
        summary = summarize_paragraphs(paragraphs[1:], min_size=200, max_size=500)
        self.assertEqual(
            summary,
            "Tromsø lies in Northern Norway. The municipality has a population of"
            " (2015) 72,066, but with an annual influx of students it has over 75,000"
            " most of the year. It is the largest urban area in Northern Norway and the"
            " third largest north of the Arctic Circle (following Murmansk and Norilsk)."
            " Most of Tromsø, including the city centre, is located on the island of"
            " Tromsøya, 350 kilometres (217 mi) north of the Arctic Circle. In 2012,"
            " Tromsøya had a population of 36,088. Substantial parts of the urban…",
        )

    def test_short_summarize(self) -> None:
        """Short paragraphs: the first two are joined by a blank line and the
        third is left out."""
        intro = (
            "Tromsø (Norwegian pronunciation: [ˈtrʊmsœ] ( listen); Northern Sami:"
            " Romsa; Finnish: Tromssa[2] Kven: Tromssa) is a city and municipality in"
            " Troms county, Norway."
        )
        population = (
            "Tromsø lies in Northern Norway. The municipality has a population of"
            " (2015) 72,066, but with an annual influx of students it has over 75,000"
            " most of the year."
        )
        city_centre = (
            "The city centre of Tromsø contains the highest number of old wooden"
            " houses in Northern Norway, the oldest house dating from 1789. The Arctic"
            " Cathedral, a modern church from 1965, is probably the most famous landmark"
            " in Tromsø."
        )

        summary = summarize_paragraphs(
            [intro, population, city_centre], min_size=200, max_size=500
        )

        self.assertEqual(
            summary,
            "Tromsø (Norwegian pronunciation: [ˈtrʊmsœ] ( listen); Northern Sami:"
            " Romsa; Finnish: Tromssa[2] Kven: Tromssa) is a city and municipality in"
            " Troms county, Norway.\n"
            "\n"
            "Tromsø lies in Northern Norway. The municipality has a population of"
            " (2015) 72,066, but with an annual influx of students it has over 75,000"
            " most of the year.",
        )

    def test_small_then_large_summarize(self) -> None:
        """A short paragraph followed by a long one: both are used, and the
        long one is cut short with an ellipsis."""
        short_para = (
            "Tromsø (Norwegian pronunciation: [ˈtrʊmsœ] ( listen); Northern Sami:"
            " Romsa; Finnish: Tromssa[2] Kven: Tromssa) is a city and municipality in"
            " Troms county, Norway."
        )
        # Deliberately a single paragraph: the two passages are concatenated.
        long_para = (
            "Tromsø lies in Northern Norway. The municipality has a population of"
            " (2015) 72,066, but with an annual influx of students it has over 75,000"
            " most of the year."
            " The city centre of Tromsø contains the highest number of old wooden"
            " houses in Northern Norway, the oldest house dating from 1789. The Arctic"
            " Cathedral, a modern church from 1965, is probably the most famous landmark"
            " in Tromsø."
        )

        summary = summarize_paragraphs(
            [short_para, long_para], min_size=200, max_size=500
        )
        self.assertEqual(
            summary,
            "Tromsø (Norwegian pronunciation: [ˈtrʊmsœ] ( listen); Northern Sami:"
            " Romsa; Finnish: Tromssa[2] Kven: Tromssa) is a city and municipality in"
            " Troms county, Norway.\n"
            "\n"
            "Tromsø lies in Northern Norway. The municipality has a population of"
            " (2015) 72,066, but with an annual influx of students it has over 75,000"
            " most of the year. The city centre of Tromsø contains the highest number"
            " of old wooden houses in Northern Norway, the oldest house dating from"
            " 1789. The Arctic Cathedral, a modern church from…",
        )
|
2016-08-16 16:53:18 +03:00
|
|
|
|
|
|
|
|
|
|
2022-06-03 19:09:12 +03:00
|
|
|
|
class OpenGraphFromHtmlTestCase(unittest.TestCase):
    """Feeds raw bodies through ``decode_body`` and checks the dictionary that
    ``parse_html_to_open_graph`` builds from the resulting tree."""

    if lxml is None:
        skip = "url preview feature requires lxml"

    def test_simple(self) -> None:
        """A <title> and plain body text become og:title and og:description."""
        body = b"""
        <html>
        <head><title>Foo</title></head>
        <body>
        Some text.
        </body>
        </html>
        """

        root = decode_body(body, "http://example.com/test.html")
        assert root is not None
        open_graph = parse_html_to_open_graph(root)

        expected = {"og:title": "Foo", "og:description": "Some text."}
        self.assertEqual(open_graph, expected)

    def test_comment(self) -> None:
        """An HTML comment ahead of the text is left out of the description."""
        body = b"""
        <html>
        <head><title>Foo</title></head>
        <body>
        <!-- HTML comment -->
        Some text.
        </body>
        </html>
        """

        root = decode_body(body, "http://example.com/test.html")
        assert root is not None
        open_graph = parse_html_to_open_graph(root)

        expected = {"og:title": "Foo", "og:description": "Some text."}
        self.assertEqual(open_graph, expected)

    def test_comment2(self) -> None:
        """A comment between text runs is dropped; the text before it, after
        it, inside a <p> and after the <p> is kept, in document order."""
        body = b"""
        <html>
        <head><title>Foo</title></head>
        <body>
        Some text.
        <!-- HTML comment -->
        Some more text.
        <p>Text</p>
        More text
        </body>
        </html>
        """

        root = decode_body(body, "http://example.com/test.html")
        assert root is not None
        open_graph = parse_html_to_open_graph(root)

        expected = {
            "og:title": "Foo",
            "og:description": "Some text.\n\nSome more text.\n\nText\n\nMore text",
        }
        self.assertEqual(open_graph, expected)

    def test_script(self) -> None:
        """The contents of a <script> element do not end up in the description."""
        body = b"""
        <html>
        <head><title>Foo</title></head>
        <body>
        <script> (function() {})() </script>
        Some text.
        </body>
        </html>
        """

        root = decode_body(body, "http://example.com/test.html")
        assert root is not None
        open_graph = parse_html_to_open_graph(root)

        expected = {"og:title": "Foo", "og:description": "Some text."}
        self.assertEqual(open_graph, expected)

    def test_missing_title(self) -> None:
        """No <title> (and no <h1>) gives a ``None`` title; an empty <title>
        with an <h1> present gives the <h1> text."""
        untitled = b"""
        <html>
        <body>
        Some text.
        </body>
        </html>
        """

        root = decode_body(untitled, "http://example.com/test.html")
        assert root is not None
        open_graph = parse_html_to_open_graph(root)

        self.assertEqual(
            open_graph, {"og:title": None, "og:description": "Some text."}
        )

        # Second variant: the <title> element exists but has no content.
        empty_title = b"""
        <html>
        <head><title></title></head>
        <body>
        <h1>Title</h1>
        </body>
        </html>
        """

        root = decode_body(empty_title, "http://example.com/test.html")
        assert root is not None
        open_graph = parse_html_to_open_graph(root)

        self.assertEqual(open_graph, {"og:title": "Title", "og:description": "Title"})

    def test_h1_as_title(self) -> None:
        """Without a <title>, the <h1> text is reported as og:title while the
        og:description meta tag supplies the description."""
        body = b"""
        <html>
        <meta property="og:description" content="Some text."/>
        <body>
        <h1>Title</h1>
        </body>
        </html>
        """

        root = decode_body(body, "http://example.com/test.html")
        assert root is not None
        open_graph = parse_html_to_open_graph(root)

        expected = {"og:title": "Title", "og:description": "Some text."}
        self.assertEqual(open_graph, expected)

    def test_empty_description(self) -> None:
        """Description meta tags whose content is empty or absent are passed
        over in favour of the first one that has content."""
        body = b"""
        <html>
        <meta property="og:description" content=""/>
        <meta property="og:description"/>
        <meta name="description" content=""/>
        <meta name="description"/>
        <meta name="description" content="Finally!"/>
        <body>
        <h1>Title</h1>
        </body>
        </html>
        """

        root = decode_body(body, "http://example.com/test.html")
        assert root is not None
        open_graph = parse_html_to_open_graph(root)

        expected = {"og:title": "Title", "og:description": "Finally!"}
        self.assertEqual(open_graph, expected)

    def test_missing_title_and_broken_h1(self) -> None:
        """An <h1> that holds only an empty link yields no title."""
        body = b"""
        <html>
        <body>
        <h1><a href="foo"/></h1>
        Some text.
        </body>
        </html>
        """

        root = decode_body(body, "http://example.com/test.html")
        assert root is not None
        open_graph = parse_html_to_open_graph(root)

        expected = {"og:title": None, "og:description": "Some text."}
        self.assertEqual(open_graph, expected)

    def test_empty(self) -> None:
        """A zero-length body decodes to ``None``."""
        root = decode_body(b"", "http://example.com/test.html")
        self.assertIsNone(root)

    def test_no_tree(self) -> None:
        """A body consisting of a single NUL byte decodes to ``None``."""
        root = decode_body(b"\x00", "http://example.com/test.html")
        self.assertIsNone(root)

    def test_xml(self) -> None:
        """An XHTML document with an XML declaration is decoded and parsed."""
        # strip() matters here: the XML declaration has to begin at the very
        # first byte of the body.
        body = b"""
        <?xml version="1.0" encoding="UTF-8"?>

        <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
        <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
        <head><title>Foo</title></head><body>Some text.</body></html>
        """.strip()
        root = decode_body(body, "http://example.com/test.html")
        assert root is not None
        open_graph = parse_html_to_open_graph(root)

        expected = {"og:title": "Foo", "og:description": "Some text."}
        self.assertEqual(open_graph, expected)

    def test_invalid_encoding(self) -> None:
        """Passing an unrecognised encoding name must not stop the body from
        being decoded; it is treated as UTF-8 where possible."""
        body = b"""
        <html>
        <head><title>Foo</title></head>
        <body>
        Some text.
        </body>
        </html>
        """
        root = decode_body(body, "http://example.com/test.html", "invalid-encoding")
        assert root is not None
        open_graph = parse_html_to_open_graph(root)

        expected = {"og:title": "Foo", "og:description": "Some text."}
        self.assertEqual(open_graph, expected)

    def test_invalid_encoding2(self) -> None:
        """The body's bytes do not match the character encoding that was sent."""
        # The two \xff bytes in the title are not a valid UTF-8 sequence.
        body = b"""
        <html>
        <head><title>\xff\xff Foo</title></head>
        <body>
        Some text.
        </body>
        </html>
        """
        root = decode_body(body, "http://example.com/test.html")
        assert root is not None
        open_graph = parse_html_to_open_graph(root)

        expected = {"og:title": "ÿÿ Foo", "og:description": "Some text."}
        self.assertEqual(open_graph, expected)

    def test_windows_1252(self) -> None:
        """The body is cp1252-encoded but carries no declaration saying so."""
        body = b"""
        <html>
        <head><title>\xf3</title></head>
        <body>
        Some text.
        </body>
        </html>
        """
        root = decode_body(body, "http://example.com/test.html")
        assert root is not None
        open_graph = parse_html_to_open_graph(root)

        expected = {"og:title": "ó", "og:description": "Some text."}
        self.assertEqual(open_graph, expected)

    def test_twitter_tag(self) -> None:
        """Twitter card meta tags fill in values when nothing else provides
        them, but never replace Open Graph values."""
        twitter_only = b"""
        <html>
        <meta name="twitter:card" content="summary">
        <meta name="twitter:description" content="Description">
        <meta name="twitter:site" content="@matrixdotorg">
        </html>
        """
        root = decode_body(twitter_only, "http://example.com/test.html")
        assert root is not None
        open_graph = parse_html_to_open_graph(root)
        self.assertEqual(
            open_graph,
            {
                "og:title": None,
                "og:description": "Description",
                "og:site_name": "@matrixdotorg",
            },
        )

        # With Open Graph tags alongside, the Open Graph values are the ones
        # reported.
        twitter_and_og = b"""
        <html>
        <meta name="twitter:card" content="summary">
        <meta name="twitter:description" content="Description">
        <meta property="og:description" content="Real Description">
        <meta name="twitter:site" content="@matrixdotorg">
        <meta property="og:site_name" content="matrix.org">
        </html>
        """
        root = decode_body(twitter_and_og, "http://example.com/test.html")
        assert root is not None
        open_graph = parse_html_to_open_graph(root)
        self.assertEqual(
            open_graph,
            {
                "og:title": None,
                "og:description": "Real Description",
                "og:site_name": "matrix.org",
            },
        )

    def test_nested_nodes(self) -> None:
        """Nested inline elements: child text must be visited in document
        order (not reversed), and the <svg> contents are left out."""
        body = b"""
        <a href="somewhere">Welcome <b>the bold <u>and underlined text <svg>
        with a cheeky SVG</svg></u> and <strong>some</strong> tail text</b></a>
        """
        root = decode_body(body, "http://example.com/test.html")
        assert root is not None
        open_graph = parse_html_to_open_graph(root)

        expected = {
            "og:title": None,
            "og:description": "Welcome\n\nthe bold\n\nand underlined text\n\nand\n\nsome\n\ntail text",
        }
        self.assertEqual(open_graph, expected)
|
|
|
|
|
|
2021-02-08 20:33:30 +03:00
|
|
|
|
|
|
|
|
|
class MediaEncodingTestCase(unittest.TestCase):
    """Checks the ordered list of candidate character encodings that
    ``_get_html_media_encodings`` yields for a body and Content-Type value."""

    def test_meta_charset(self) -> None:
        """A character encoding is found via the meta tag."""
        encodings = _get_html_media_encodings(
            b"""
        <html>
        <head><meta charset="ascii">
        </head>
        </html>
        """,
            "text/html",
        )
        self.assertEqual(list(encodings), ["ascii", "utf-8", "cp1252"])

        # A less well-formed version: stray spaces and an unquoted value.
        encodings = _get_html_media_encodings(
            b"""
        <html>
        <head>< meta charset = ascii>
        </head>
        </html>
        """,
            "text/html",
        )
        self.assertEqual(list(encodings), ["ascii", "utf-8", "cp1252"])

    def test_meta_charset_underscores(self) -> None:
        """A meta charset containing an underscore is accepted; the expected
        entry is the lower-cased name."""
        encodings = _get_html_media_encodings(
            b"""
        <html>
        <head><meta charset="Shift_JIS">
        </head>
        </html>
        """,
            "text/html",
        )
        self.assertEqual(list(encodings), ["shift_jis", "utf-8", "cp1252"])

    def test_xml_encoding(self) -> None:
        """A character encoding is found via the XML declaration."""
        encodings = _get_html_media_encodings(
            b"""
        <?xml version="1.0" encoding="ascii"?>
        <html>
        </html>
        """,
            "text/html",
        )
        self.assertEqual(list(encodings), ["ascii", "utf-8", "cp1252"])

    def test_meta_xml_encoding(self) -> None:
        """Meta tags take precedence over XML encoding."""
        encodings = _get_html_media_encodings(
            b"""
        <?xml version="1.0" encoding="ascii"?>
        <html>
        <head><meta charset="UTF-16">
        </head>
        </html>
        """,
            "text/html",
        )
        self.assertEqual(list(encodings), ["utf-16", "ascii", "utf-8", "cp1252"])

    def test_content_type(self) -> None:
        """A character encoding is found via the Content-Type header."""
        # Test a few variations of the header, including unbalanced quotes.
        headers = (
            'text/html; charset="ascii";',
            "text/html;charset=ascii;",
            'text/html; charset="ascii"',
            "text/html; charset=ascii",
            'text/html; charset="ascii;',
            'text/html; charset=ascii";',
        )
        for header in headers:
            encodings = _get_html_media_encodings(b"", header)
            # Name the header in the failure message so a broken variant can
            # be identified without re-running the loop by hand.
            self.assertEqual(
                list(encodings),
                ["ascii", "utf-8", "cp1252"],
                msg=f"Content-Type header: {header!r}",
            )

    def test_fallback(self) -> None:
        """A character encoding cannot be found in the body or header."""
        encodings = _get_html_media_encodings(b"", "text/html")
        self.assertEqual(list(encodings), ["utf-8", "cp1252"])

    def test_duplicates(self) -> None:
        """Ensure each encoding is only attempted once.

        The XML declaration, the meta tag and the header each spell UTF-8
        differently ("utf8", "UTF-8", "UTF_8"); only one "utf-8" entry is
        expected.
        """
        encodings = _get_html_media_encodings(
            b"""
        <?xml version="1.0" encoding="utf8"?>
        <html>
        <head><meta charset="UTF-8">
        </head>
        </html>
        """,
            'text/html; charset="UTF_8"',
        )
        self.assertEqual(list(encodings), ["utf-8", "cp1252"])

    def test_unknown_invalid(self) -> None:
        """A character encoding should be ignored if it is unknown or invalid."""
        encodings = _get_html_media_encodings(
            b"""
        <html>
        <head><meta charset="invalid">
        </head>
        </html>
        """,
            'text/html; charset="invalid"',
        )
        self.assertEqual(list(encodings), ["utf-8", "cp1252"])
|