Coverage for an_website / quotes / info.py: 82.558%

86 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-03-04 20:05 +0000

1# This program is free software: you can redistribute it and/or modify 

2# it under the terms of the GNU Affero General Public License as 

3# published by the Free Software Foundation, either version 3 of the 

4# License, or (at your option) any later version. 

5# 

6# This program is distributed in the hope that it will be useful, 

7# but WITHOUT ANY WARRANTY; without even the implied warranty of 

8# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

9# GNU Affero General Public License for more details. 

10# 

11# You should have received a copy of the GNU Affero General Public License 

12# along with this program. If not, see <https://www.gnu.org/licenses/>. 

13 

14"""Info page to show information about authors and quotes.""" 

15 

16from __future__ import annotations 

17 

18import logging 

19from datetime import datetime, timedelta, timezone 

20from typing import Final, cast 

21from urllib.parse import quote as quote_url 

22 

23import orjson as json 

24import regex 

25from tornado.httpclient import AsyncHTTPClient 

26from tornado.web import HTTPError 

27 

28from .. import CA_BUNDLE_PATH, EVENT_REDIS 

29from ..utils.request_handler import HTMLRequestHandler 

30from .utils import get_author_by_id, get_quote_by_id, get_wrong_quotes 

31 

# Logger for this module, obtained via the module's __name__.
LOGGER: Final = logging.getLogger(__name__)

33 

34 

class QuotesInfoPage(HTMLRequestHandler):
    """Request handler serving the info page of a single quote."""

    RATELIMIT_GET_LIMIT = 30

    async def get(self, id_str: str, *, head: bool = False) -> None:
        """
        Render the info page for the quote with the given ID.

        Raise a 404 error if no quote with this ID is found.
        For HEAD requests nothing is rendered.
        """
        quote_id: int = int(id_str)
        quote = await get_quote_by_id(quote_id)
        if quote is None:
            raise HTTPError(404)
        if head:
            return

        # all wrong quotes that use this quote, sorted
        matching_wrong_quotes = get_wrong_quotes(
            lambda wq: wq.quote_id == quote_id,
            sort=True,
        )
        template_args = {
            "quote": quote,
            "wrong_quotes": matching_wrong_quotes,
            "title": "Zitat-Informationen",
            "short_title": "Zitat-Info",
            "type": "Zitat",
            "id": quote.id,
            "text": str(quote),
            "description": f"Falsch zugeordnete Zitate mit „{quote}“ als Zitat.",
            "create_kwargs": {"quote": quote.id},
        }
        await self.render("pages/quotes/quote_info.html", **template_args)

61 

62 

# API endpoint of the German Wikipedia (default for the lookups in this module)
WIKI_API_DE: Final[str] = "https://de.wikipedia.org/w/api.php"
# API endpoint of the English Wikipedia (fallback used by search_wikipedia)
WIKI_API_EN: Final[str] = "https://en.wikipedia.org/w/api.php"

65 

66 

async def search_wikipedia(
    query: str, api: str = WIKI_API_DE
) -> None | tuple[str, None | str, datetime]:
    """
    Look up the query with the opensearch action of a Wikipedia API.

    If the German API returns no match, the English API is tried next.

    Return None for an empty query or if nothing was found. Otherwise
    return a tuple with the page URL, the page content (may be None)
    and the current time in UTC.
    """
    if not query:
        return None

    search_url = (
        f"{api}?action=opensearch&namespace=0&profile=normal&"
        f"search={quote_url(query)}&limit=1&redirects=resolve&format=json"
    )
    response = await AsyncHTTPClient().fetch(
        search_url, ca_certs=CA_BUNDLE_PATH
    )
    results = json.loads(response.body)

    titles = results[1]
    if titles:
        # take the URL of the first match & replace "," with "%2C"
        page_url = str(results[3][0]).replace(",", "%2C")
        content = await get_wikipedia_page_content(titles[0], api)
        return page_url, content, datetime.now(timezone.utc)

    if api == WIKI_API_DE:
        # nothing found in German, retry with the English Wikipedia
        return await search_wikipedia(query, WIKI_API_EN)
    return None  # nothing found

99 

100 

async def get_wikipedia_page_content(
    page_name: str, api: str = WIKI_API_DE
) -> None | str:
    """
    Get the intro extract of a Wikipedia page and return it.

    The API at ``api`` is queried for a plain text extract of the
    intro of ``page_name`` (limited via ``exsentences=5``).

    Return None if the response has no ``query``/``pages`` entry,
    if ``pages`` is empty or if the first page has no ``extract``.
    """
    response = await AsyncHTTPClient().fetch(
        (
            f"{api}?action=query&prop=extracts&exsectionformat=plain&exintro&"
            f"titles={quote_url(page_name)}&explaintext&format=json&exsentences=5"
        ),
        ca_certs=CA_BUNDLE_PATH,
    )
    response_json = json.loads(response.body)
    if "query" not in response_json or "pages" not in response_json["query"]:
        return None
    # the values of "pages" are page objects (mappings), not plain strings
    pages: dict[str, dict[str, object]] = response_json["query"]["pages"]
    # only the first page is used; an empty mapping must not raise
    page = next(iter(pages.values()), None)
    if page is None:
        return None
    return cast("None | str", page.get("extract"))

120 

121 

def fix_author_for_wikipedia_search(author: str) -> str:
    """
    Fix author for Wikipedia search.

    This tries to reduce common problems with authors.
    So that we can show more information.

    The following steps are applied in order:
    whitespace runs are collapsed to a single space, a parenthesised
    part is removed, a trailing "Werbespruch" or "Werbung" is removed
    and a leading "nach" or "Ein" is removed (all case-insensitive).
    """
    author = regex.sub(r"\s+", " ", author)
    author = regex.sub(r"\s*\(.*\)", "", author)
    # flags has to be passed by keyword, the 4th positional argument is count
    author = regex.sub(
        r"\s*Werbespruch$", "", author, flags=regex.IGNORECASE
    )
    author = regex.sub(r"\s*Werbung$", "", author, flags=regex.IGNORECASE)
    # \b keeps names that merely start with "Nach" (e.g. "Nachtigall") intact
    author = regex.sub(r"^nach\b\s*", "", author, flags=regex.IGNORECASE)
    author = regex.sub(r"^Ein\s+", "", author, flags=regex.IGNORECASE)
    return author

136 

137 

# time to live in seconds (30 days, roughly 1 month)
AUTHOR_INFO_NEW_TTL: Final[int] = 60 * 60 * 24 * 30

140 

141 

class AuthorsInfoPage(HTMLRequestHandler):
    """The request handler used for the author info page."""

    RATELIMIT_GET_LIMIT = 5

    async def get(self, id_str: str, *, head: bool = False) -> None:
        """
        Handle GET requests to the author info page.

        Raise a 404 error if no author with the given ID is found.
        For HEAD requests nothing is rendered.

        If the author has no info yet, it is read from Redis when a
        cached value is available. Otherwise Wikipedia is searched and
        a result with content is cached for AUTHOR_INFO_NEW_TTL seconds.
        """
        author_id: int = int(id_str)
        author = await get_author_by_id(author_id)
        if author is None:
            raise HTTPError(404)
        if head:
            return
        if author.info is None:
            result = None
            fixed_author_name = fix_author_for_wikipedia_search(author.name)
            if EVENT_REDIS.is_set():
                # try to get the info from Redis
                result = await self.redis.get(
                    self.get_redis_info_key(fixed_author_name)
                )
            # the cached value has the form "<url>|<content>"
            if result and (len(info := result.split("|", maxsplit=1)) > 1):
                remaining_ttl = await self.redis.ttl(
                    self.get_redis_info_key(fixed_author_name)
                )
                # the value gets saved with AUTHOR_INFO_NEW_TTL, so the
                # already elapsed part of it tells when it was created
                creation_date = datetime.now(tz=timezone.utc) - timedelta(
                    seconds=AUTHOR_INFO_NEW_TTL - remaining_ttl
                )
                author.info = (info[0], info[1], creation_date)
            else:
                author.info = await search_wikipedia(fixed_author_name)
                if author.info is None or author.info[1] is None:
                    # nothing found
                    LOGGER.info("No information found about %s", repr(author))
                elif EVENT_REDIS.is_set():
                    await self.redis.setex(
                        self.get_redis_info_key(fixed_author_name),
                        AUTHOR_INFO_NEW_TTL,
                        # value to save (the author info)
                        # type is ignored, because author.info[1] is not None
                        "|".join(author.info[0:2]),  # type: ignore[arg-type]
                    )

        wqs = get_wrong_quotes(
            lambda wq: wq.author_id == author_id,
            sort=True,
        )

        await self.render(
            "pages/quotes/author_info.html",
            author=author,
            wrong_quotes=wqs,
            title="Autor-Informationen",
            short_title="Autor-Info",
            type="Autor",
            id=author_id,
            text=str(author),
            description=f"Falsch zugeordnete Zitate mit „{author}“ als Autor.",
            create_kwargs={"author": author_id},
        )

    def get_redis_info_key(self, author_name: str) -> str:
        """Get the key to save the author info with Redis."""
        return f"{self.redis_prefix}:quote-author-info:{author_name}"

208 return f"{self.redis_prefix}:quote-author-info:{author_name}"