Coverage for an_website/quotes/info.py: 80.682%

88 statements  

« prev     ^ index     » next       coverage.py v7.14.1, created at 2026-06-10 18:56 +0000

1# This program is free software: you can redistribute it and/or modify 

2# it under the terms of the GNU Affero General Public License as 

3# published by the Free Software Foundation, either version 3 of the 

4# License, or (at your option) any later version. 

5# 

6# This program is distributed in the hope that it will be useful, 

7# but WITHOUT ANY WARRANTY; without even the implied warranty of 

8# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

9# GNU Affero General Public License for more details. 

10# 

11# You should have received a copy of the GNU Affero General Public License 

12# along with this program. If not, see <https://www.gnu.org/licenses/>. 

13 

14"""Info page to show information about authors and quotes.""" 

15 

16import logging 

17from datetime import datetime, timedelta, timezone 

18from typing import Final, cast 

19from urllib.parse import quote as quote_url 

20 

21import orjson as json 

22import regex 

23from tornado.httpclient import AsyncHTTPClient, HTTPClientError 

24from tornado.web import HTTPError 

25 

26from .. import CA_BUNDLE_PATH, EVENT_REDIS 

27from ..utils.request_handler import HTMLRequestHandler 

28from .utils import get_author_by_id, get_quote_by_id, get_wrong_quotes 

29 

# Module-level logger, named after this module.
LOGGER: Final[logging.Logger] = logging.getLogger(__name__)

31 

32 

class QuotesInfoPage(HTMLRequestHandler):
    """Request handler that renders the info page of a single quote."""

    RATELIMIT_GET_LIMIT = 30

    async def get(self, id_str: str, *, head: bool = False) -> None:
        """
        Render the info page of the quote with the given ID.

        Raise a 404 HTTPError if no quote with this ID is found.
        If head is true, return after that check without rendering.
        """
        quote_id = int(id_str)

        if (quote := await get_quote_by_id(quote_id)) is None:
            raise HTTPError(404)
        if head:
            return

        # only the wrong quotes that use this quote are shown on the page
        template_kwargs = {
            "quote": quote,
            "wrong_quotes": get_wrong_quotes(
                lambda wq: wq.quote_id == quote_id, sort=True
            ),
            "title": "Zitat-Informationen",
            "short_title": "Zitat-Info",
            "type": "Zitat",
            "id": quote.id,
            "text": str(quote),
            "description": (
                f"Falsch zugeordnete Zitate mit „{quote}“ als Zitat."
            ),
            "create_kwargs": {"quote": quote.id},
        }
        await self.render("pages/quotes/quote_info.html", **template_kwargs)

59 

60 

61WIKI_API_DE: Final[str] = "https://de.wikipedia.org/w/api.php" 

62WIKI_API_EN: Final[str] = "https://en.wikipedia.org/w/api.php" 

63 

64 

65async def search_wikipedia( 

66 query: str, api: str = WIKI_API_DE 

67) -> None | tuple[str, None | str, datetime]: 

68 """ 

69 Search Wikipedia to get information about the query. 

70 

71 Return a tuple with the URL and the content. 

72 """ 

73 if not query: 

74 return None 

75 # try to get the info from Wikipedia 

76 response = await AsyncHTTPClient().fetch( 

77 ( 

78 f"{api}?action=opensearch&namespace=0&profile=normal&" 

79 f"search={quote_url(query)}&limit=1&redirects=resolve&format=json" 

80 ), 

81 ca_certs=CA_BUNDLE_PATH, 

82 ) 

83 response_json = json.loads(response.body) 

84 if not response_json[1]: 

85 if api == WIKI_API_DE: 

86 return await search_wikipedia(query, WIKI_API_EN) 

87 return None # nothing found 

88 page_name = response_json[1][0] 

89 # get the URL of the content & replace "," with "%2C" 

90 url = str(response_json[3][0]).replace(",", "%2C") 

91 

92 return ( 

93 url, 

94 await get_wikipedia_page_content(page_name, api), 

95 datetime.now(timezone.utc), 

96 ) 

97 

98 

async def get_wikipedia_page_content(
    page_name: str, api: str = WIKI_API_DE
) -> None | str:
    """
    Request the plain text intro extract of a Wikipedia page.

    Return the extract of the first page in the API response, or None
    if the response has no pages or that page has no extract.
    """
    request_url = (
        f"{api}?action=query&prop=extracts&exsectionformat=plain&exintro&"
        f"titles={quote_url(page_name)}&explaintext&format=json&exsentences=5"
    )
    response = await AsyncHTTPClient().fetch(
        request_url, ca_certs=CA_BUNDLE_PATH
    )
    data = json.loads(response.body)

    if "query" in data and "pages" in data["query"]:
        pages: dict[str, dict[str, str]] = data["query"]["pages"]
        # only the first page of the response is used
        first_page = list(pages.values())[0]
        return first_page["extract"] if "extract" in first_page else None

    return None

118 

119 

def fix_author_for_wikipedia_search(author: str) -> str:
    """
    Fix author for Wikipedia search.

    This tries to reduce common problems with authors.
    So that we can show more information.

    The following clean-ups are applied in order:
    whitespace runs are collapsed to a single space, text in parentheses
    is removed, a trailing "Werbespruch" or "Werbung" is removed and a
    leading "nach" or "Ein " is removed (all four case-insensitively).
    """
    author = regex.sub(r"\s+", " ", author)
    author = regex.sub(r"\s*\(.*\)", "", author)
    # The flags have to be passed by keyword: the 4th positional parameter
    # of sub() is count, so passing IGNORECASE there left the patterns
    # case-sensitive and only limited the number of replacements.
    author = regex.sub(
        r"\s*Werbespruch$", "", author, flags=regex.IGNORECASE
    )
    author = regex.sub(r"\s*Werbung$", "", author, flags=regex.IGNORECASE)
    # NOTE(review): with IGNORECASE in effect this also strips "Nach" from
    # names that merely start with it (no whitespace is required after
    # "nach") - confirm whether the pattern should require a separator.
    author = regex.sub(r"^nach\s*", "", author, flags=regex.IGNORECASE)
    author = regex.sub(r"^Ein\s+", "", author, flags=regex.IGNORECASE)
    return author

134 

135 

# time to live in seconds (1 month)
AUTHOR_INFO_NEW_TTL: Final[int] = 60 * 60 * 24 * 30


class AuthorsInfoPage(HTMLRequestHandler):
    """The request handler used for the author info page."""

    RATELIMIT_GET_LIMIT = 5

    async def get(self, id_str: str, *, head: bool = False) -> None:
        """
        Handle GET requests to the author info page.

        Raise a 404 HTTPError if no author with the given ID is found.
        If head is true, return after that check without rendering.

        If the author has no info yet, it is read from Redis (if Redis is
        available) or else searched on Wikipedia. Info found on Wikipedia
        is saved in Redis for AUTHOR_INFO_NEW_TTL seconds.
        """
        # pylint: disable=too-complex
        author_id: int = int(id_str)
        author = await get_author_by_id(author_id)
        if author is None:
            raise HTTPError(404)
        if head:
            return
        if author.info is None:
            result = None
            fixed_author_name = fix_author_for_wikipedia_search(author.name)
            if EVENT_REDIS.is_set():
                # try to get the info from Redis
                result = await self.redis.get(
                    self.get_redis_info_key(fixed_author_name)
                )
            # the saved value has the form "<url>|<content>"
            if result and (len(info := result.split("|", maxsplit=1)) > 1):
                remaining_ttl = await self.redis.ttl(
                    self.get_redis_info_key(fixed_author_name)
                )
                # the key is saved with AUTHOR_INFO_NEW_TTL, so the time
                # that already passed tells when the info was created
                creation_date = datetime.now(tz=timezone.utc) - timedelta(
                    seconds=AUTHOR_INFO_NEW_TTL - remaining_ttl
                )
                author.info = (info[0], info[1], creation_date)
            else:
                try:
                    author.info = await search_wikipedia(fixed_author_name)
                except HTTPClientError as err:
                    # the message has to be the first argument, the error
                    # is only formatted into it
                    LOGGER.error("Searching wikipedia failed: %s", err)
                if author.info is None or author.info[1] is None:
                    # nothing found
                    LOGGER.info("No information found about %s", repr(author))
                elif EVENT_REDIS.is_set():
                    await self.redis.setex(
                        self.get_redis_info_key(fixed_author_name),
                        AUTHOR_INFO_NEW_TTL,
                        # value to save (the author info)
                        # type is ignored, because author.info[1] is not None
                        "|".join(author.info[0:2]),  # type: ignore[arg-type]
                    )

        wqs = get_wrong_quotes(
            lambda wq: wq.author_id == author_id,
            sort=True,
        )

        await self.render(
            "pages/quotes/author_info.html",
            author=author,
            wrong_quotes=wqs,
            title="Autor-Informationen",
            short_title="Autor-Info",
            type="Autor",
            id=author_id,
            text=str(author),
            description=f"Falsch zugeordnete Zitate mit „{author}“ als Autor.",
            create_kwargs={"author": author_id},
        )

    def get_redis_info_key(self, author_name: str) -> str:
        """Get the key to save the author info with Redis."""
        return f"{self.redis_prefix}:quote-author-info:{author_name}"

210 return f"{self.redis_prefix}:quote-author-info:{author_name}"