Coverage for an_website/quotes/info.py: 83.951%
81 statements
« prev ^ index » next coverage.py v7.6.4, created at 2024-11-16 19:56 +0000
« prev ^ index » next coverage.py v7.6.4, created at 2024-11-16 19:56 +0000
1# This program is free software: you can redistribute it and/or modify
2# it under the terms of the GNU Affero General Public License as
3# published by the Free Software Foundation, either version 3 of the
4# License, or (at your option) any later version.
5#
6# This program is distributed in the hope that it will be useful,
7# but WITHOUT ANY WARRANTY; without even the implied warranty of
8# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
9# GNU Affero General Public License for more details.
10#
11# You should have received a copy of the GNU Affero General Public License
12# along with this program. If not, see <https://www.gnu.org/licenses/>.
14"""Info page to show information about authors and quotes."""
16from __future__ import annotations
18import logging
19from datetime import datetime, timedelta, timezone
20from typing import Final, cast
21from urllib.parse import quote as quote_url
23import orjson as json
24import regex
25from tornado.httpclient import AsyncHTTPClient
27from .. import CA_BUNDLE_PATH, EVENT_REDIS
28from ..utils.request_handler import HTMLRequestHandler
29from .utils import get_author_by_id, get_quote_by_id, get_wrong_quotes
31LOGGER: Final = logging.getLogger(__name__)
class QuotesInfoPage(HTMLRequestHandler):
    """Request handler that serves the info page of a single quote."""

    # limit applied to GET requests of this handler
    # (presumably evaluated by the base request handler -- verify there)
    RATELIMIT_GET_LIMIT = 30

    async def get(self, id_str: str, *, head: bool = False) -> None:
        """
        Render the info page for the quote with the given ID.

        The quote is looked up for HEAD requests as well; only the
        collection of wrong quotes and the rendering are skipped.
        """
        quote_id = int(id_str)
        quote = await get_quote_by_id(quote_id)
        if not head:
            # all wrong quotes that use exactly this quote
            matching_wrong_quotes = get_wrong_quotes(
                lambda wq: wq.quote_id == quote_id, sort=True
            )
            await self.render(
                "pages/quotes/quote_info.html",
                quote=quote,
                wrong_quotes=matching_wrong_quotes,
                title="Zitat-Informationen",
                short_title="Zitat-Info",
                type="Zitat",
                id=quote_id,
                text=str(quote),
                description=f"Falsch zugeordnete Zitate mit „{quote}“ als Zitat.",
                create_kwargs={"quote": quote_id},
            )
# API endpoints of the German and the English Wikipedia;
# the German one is the default for the lookups in this module
WIKI_API_DE: Final[str] = "https://de.wikipedia.org/w/api.php"
WIKI_API_EN: Final[str] = "https://en.wikipedia.org/w/api.php"
async def search_wikipedia(
    query: str, api: str = WIKI_API_DE
) -> None | tuple[str, None | str, datetime]:
    """
    Search Wikipedia for the query and fetch the best match.

    A single opensearch hit is requested from the given API.  If the
    German API returns no hit, the search is repeated against the
    English API.

    Return None if the query is empty or nothing was found.  Otherwise
    return a tuple of the page URL, the page content as returned by
    get_wikipedia_page_content (may be None) and the current UTC time.
    """
    if not query:
        return None

    search_url = (
        f"{api}?action=opensearch&namespace=0&profile=normal&"
        f"search={quote_url(query)}&limit=1&redirects=resolve&format=json"
    )
    response = await AsyncHTTPClient().fetch(
        search_url,
        ca_certs=CA_BUNDLE_PATH,
    )
    search_result = json.loads(response.body)

    # index 1 holds the found page names, index 3 the matching URLs
    page_names = search_result[1]
    if page_names:
        page_name = page_names[0]
        # commas in the page URL get replaced with "%2C"
        page_url = str(search_result[3][0]).replace(",", "%2C")
        page_content = await get_wikipedia_page_content(page_name, api)
        return page_url, page_content, datetime.now(timezone.utc)

    if api == WIKI_API_DE:
        # nothing found in German, try the English Wikipedia
        return await search_wikipedia(query, WIKI_API_EN)
    return None  # nothing found
async def get_wikipedia_page_content(
    page_name: str, api: str = WIKI_API_DE
) -> None | str:
    """
    Get the intro text of a Wikipedia page and return it.

    The "extracts" query of the given API is used to request a plain
    text extract of the page intro, limited to five sentences.

    Return None if the response contains no pages or if the first
    page in the response has no extract.
    """
    response = await AsyncHTTPClient().fetch(
        (
            f"{api}?action=query&prop=extracts&exsectionformat=plain&exintro&"
            f"titles={quote_url(page_name)}&explaintext&format=json&exsentences=5"
        ),
        ca_certs=CA_BUNDLE_PATH,
    )
    response_json = json.loads(response.body)
    if "query" not in response_json or "pages" not in response_json["query"]:
        return None
    # the values of "pages" are page objects (mappings), not strings
    pages = cast(dict[str, dict[str, str]], response_json["query"]["pages"])
    # only the first page is of interest; an empty mapping results
    # in None instead of an IndexError
    page = next(iter(pages.values()), None)
    if page is None:
        return None
    return page.get("extract")
def fix_author_for_wikipedia_search(author: str) -> str:
    """
    Fix author for Wikipedia search.

    This tries to reduce common problems with authors,
    so that we can show more information.

    The following clean-ups are applied in order:
    runs of whitespace are collapsed to one space, everything in
    parentheses is dropped, a trailing "Werbespruch" or "Werbung"
    and a leading "nach" or "Ein" are removed (ignoring case).
    """
    author = regex.sub(r"\s+", " ", author)
    author = regex.sub(r"\s*\(.*\)", "", author)
    # the flags have to be passed by keyword, because the fourth
    # positional argument of sub() is "count" and not "flags"
    author = regex.sub(r"\s*Werbespruch$", "", author, flags=regex.IGNORECASE)
    author = regex.sub(r"\s*Werbung$", "", author, flags=regex.IGNORECASE)
    # \b keeps names that merely start with "nach" (e.g. "Nachtigall")
    author = regex.sub(r"^nach\b\s*", "", author, flags=regex.IGNORECASE)
    author = regex.sub(r"^Ein\s+", "", author, flags=regex.IGNORECASE)
    return author
# time to live of newly stored author info in seconds (30 days, ~1 month)
AUTHOR_INFO_NEW_TTL: Final[int] = int(timedelta(days=30).total_seconds())
class AuthorsInfoPage(HTMLRequestHandler):
    """The request handler used for the author info page."""

    # limit applied to GET requests of this handler
    # (presumably evaluated by the base request handler -- verify there)
    RATELIMIT_GET_LIMIT = 5

    async def get(self, id_str: str, *, head: bool = False) -> None:
        """
        Handle GET requests to the author info page.

        If the author has no info yet, it is read from Redis when a
        cached entry exists and searched on Wikipedia otherwise.
        Wikipedia results that include content are stored in Redis
        with a time to live of AUTHOR_INFO_NEW_TTL seconds.
        """
        author_id: int = int(id_str)
        author = await get_author_by_id(author_id)
        if head:
            return
        if author.info is None:
            result = None
            fixed_author_name = fix_author_for_wikipedia_search(author.name)
            if EVENT_REDIS.is_set():
                # try to get the info from Redis
                result = await self.redis.get(
                    self.get_redis_info_key(fixed_author_name)
                )
            # a usable entry has the form "<url>|<content>"
            if result and (len(info := result.split("|", maxsplit=1)) > 1):
                remaining_ttl = await self.redis.ttl(
                    self.get_redis_info_key(fixed_author_name)
                )
                # entries get stored with AUTHOR_INFO_NEW_TTL, so the part
                # of the TTL that has elapsed gives the time of creation
                # NOTE(review): Redis TTL can be negative for keys that are
                # missing or have no expiry -- confirm this is acceptable
                creation_date = datetime.now(tz=timezone.utc) - timedelta(
                    seconds=AUTHOR_INFO_NEW_TTL - remaining_ttl
                )
                # info has exactly two parts here (checked above)
                author.info = (info[0], info[1], creation_date)
            else:
                author.info = await search_wikipedia(fixed_author_name)
                if author.info is None or author.info[1] is None:
                    # nothing found
                    LOGGER.info("No information found about %r", author)
                elif EVENT_REDIS.is_set():
                    await self.redis.setex(
                        self.get_redis_info_key(fixed_author_name),
                        AUTHOR_INFO_NEW_TTL,
                        # value to save (the author info)
                        # type is ignored, because author.info[1] is not None
                        "|".join(author.info[0:2]),  # type: ignore[arg-type]
                    )

        wqs = get_wrong_quotes(
            lambda wq: wq.author_id == author_id,
            sort=True,
        )

        await self.render(
            "pages/quotes/author_info.html",
            author=author,
            wrong_quotes=wqs,
            title="Autor-Informationen",
            short_title="Autor-Info",
            type="Autor",
            id=author_id,
            text=str(author),
            description=f"Falsch zugeordnete Zitate mit „{author}“ als Autor.",
            create_kwargs={"author": author_id},
        )

    def get_redis_info_key(self, author_name: str) -> str:
        """Get the key to save the author info with Redis."""
        return f"{self.redis_prefix}:quote-author-info:{author_name}"