Coverage for an_website / quotes / info.py: 82.353%

85 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-03-31 10:27 +0000

1# This program is free software: you can redistribute it and/or modify 

2# it under the terms of the GNU Affero General Public License as 

3# published by the Free Software Foundation, either version 3 of the 

4# License, or (at your option) any later version. 

5# 

6# This program is distributed in the hope that it will be useful, 

7# but WITHOUT ANY WARRANTY; without even the implied warranty of 

8# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

9# GNU Affero General Public License for more details. 

10# 

11# You should have received a copy of the GNU Affero General Public License 

12# along with this program. If not, see <https://www.gnu.org/licenses/>. 

13 

14"""Info page to show information about authors and quotes.""" 

15 

16 

17import logging 

18from datetime import datetime, timedelta, timezone 

19from typing import Final, cast 

20from urllib.parse import quote as quote_url 

21 

22import orjson as json 

23import regex 

24from tornado.httpclient import AsyncHTTPClient 

25from tornado.web import HTTPError 

26 

27from .. import CA_BUNDLE_PATH, EVENT_REDIS 

28from ..utils.request_handler import HTMLRequestHandler 

29from .utils import get_author_by_id, get_quote_by_id, get_wrong_quotes 

30 

31LOGGER: Final = logging.getLogger(__name__) 

32 

33 

class QuotesInfoPage(HTMLRequestHandler):
    """Request handler that serves the info page of a single quote."""

    RATELIMIT_GET_LIMIT = 30

    async def get(self, id_str: str, *, head: bool = False) -> None:
        """
        Handle GET requests to the quote info page.

        Raise a 404 error if no quote with the given ID is found.
        If head is true, return after that check without rendering.
        """
        requested_id = int(id_str)
        quote = await get_quote_by_id(requested_id)
        if quote is None:
            raise HTTPError(404)
        if head:
            return

        # every wrong quote that is built from this quote
        matching_wrong_quotes = get_wrong_quotes(
            lambda wq: wq.quote_id == requested_id, sort=True
        )
        template_context = {
            "quote": quote,
            "wrong_quotes": matching_wrong_quotes,
            "title": "Zitat-Informationen",
            "short_title": "Zitat-Info",
            "type": "Zitat",
            "id": quote.id,
            "text": str(quote),
            "description": f"Falsch zugeordnete Zitate mit „{quote}“ als Zitat.",
            "create_kwargs": {"quote": quote.id},
        }
        await self.render("pages/quotes/quote_info.html", **template_context)

60 

61 

# api.php endpoints of the German and the English Wikipedia
WIKI_API_DE: Final[str] = "https://de.wikipedia.org/w/api.php"
WIKI_API_EN: Final[str] = "https://en.wikipedia.org/w/api.php"

64 

65 

async def search_wikipedia(
    query: str, api: str = WIKI_API_DE
) -> None | tuple[str, None | str, datetime]:
    """
    Search Wikipedia for the query and fetch the content of the first hit.

    If the search on WIKI_API_DE finds nothing, the search is repeated
    on WIKI_API_EN.

    Return None for an empty query or if nothing was found.
    Otherwise return a tuple with the URL of the page, the content of
    the page (which may be None) and the current time in UTC.
    """
    if not query:
        return None

    # try to get the info from Wikipedia
    search_url = (
        f"{api}?action=opensearch&namespace=0&profile=normal&"
        f"search={quote_url(query)}&limit=1&redirects=resolve&format=json"
    )
    response = await AsyncHTTPClient().fetch(
        search_url,
        ca_certs=CA_BUNDLE_PATH,
    )
    search_result = json.loads(response.body)
    found_titles = search_result[1]

    if found_titles:
        # get the URL of the content & replace "," with "%2C"
        page_url = str(search_result[3][0]).replace(",", "%2C")
        page_content = await get_wikipedia_page_content(found_titles[0], api)
        return page_url, page_content, datetime.now(timezone.utc)

    if api == WIKI_API_DE:
        # nothing found in German, try again in English
        return await search_wikipedia(query, WIKI_API_EN)

    return None  # nothing found

98 

99 

async def get_wikipedia_page_content(
    page_name: str, api: str = WIKI_API_DE
) -> None | str:
    """
    Get content from a Wikipedia page and return it.

    The request asks for a plain-text extract of the intro that is
    limited to five sentences (exintro, explaintext, exsentences=5).

    Return None if the response contains no pages or if the first page
    in the response has no extract.
    """
    response = await AsyncHTTPClient().fetch(
        (
            f"{api}?action=query&prop=extracts&exsectionformat=plain&exintro&"
            f"titles={quote_url(page_name)}&explaintext&format=json&exsentences=5"
        ),
        ca_certs=CA_BUNDLE_PATH,
    )
    response_json = json.loads(response.body)
    if "query" not in response_json or "pages" not in response_json["query"]:
        return None
    # "pages" maps keys to page objects, the extract is a field of a page
    pages = cast(dict[str, dict[str, str]], response_json["query"]["pages"])
    # an empty mapping must yield None instead of raising an IndexError
    first_page = next(iter(pages.values()), None)
    if first_page is None or "extract" not in first_page:
        return None
    return first_page["extract"]

119 

120 

def fix_author_for_wikipedia_search(author: str) -> str:
    """
    Fix author for Wikipedia search.

    This tries to reduce common problems with authors.
    So that we can show more information.

    The following is done in this order: runs of whitespace are
    collapsed into a single space, text in parentheses is removed,
    a trailing "Werbespruch" or "Werbung" is removed and a leading
    "nach" or "Ein" is removed. The last four are case-insensitive.
    """
    author = regex.sub(r"\s+", " ", author)
    author = regex.sub(r"\s*\(.*\)", "", author)
    # The flags have to be passed by keyword, because the fourth
    # positional parameter of sub() is count and not flags.
    author = regex.sub(r"\s*Werbespruch$", "", author, flags=regex.IGNORECASE)
    author = regex.sub(r"\s*Werbung$", "", author, flags=regex.IGNORECASE)
    # \b keeps names that merely start with "nach" (e.g. "Nachtigall") intact
    author = regex.sub(r"^nach\b\s*", "", author, flags=regex.IGNORECASE)
    author = regex.sub(r"^Ein\s+", "", author, flags=regex.IGNORECASE)
    return author

135 

136 

# time to live of the cached author info in seconds (30 days)
AUTHOR_INFO_NEW_TTL: Final[int] = 30 * 24 * 60 * 60

139 

140 

class AuthorsInfoPage(HTMLRequestHandler):
    """The request handler used for the author info page."""

    RATELIMIT_GET_LIMIT = 5

    async def get(self, id_str: str, *, head: bool = False) -> None:
        """
        Handle GET requests to the author info page.

        Raise a 404 error if no author with the given ID is found.
        If head is true, return after that check without rendering.

        If the author has no info yet, the info is taken from Redis
        (only if EVENT_REDIS is set) or else searched on Wikipedia.
        A Wikipedia result with content is saved in Redis for
        AUTHOR_INFO_NEW_TTL seconds.
        """
        author_id: int = int(id_str)
        author = await get_author_by_id(author_id)
        if author is None:
            raise HTTPError(404)
        if head:
            return
        if author.info is None:
            fixed_author_name = fix_author_for_wikipedia_search(author.name)
            redis_key = self.get_redis_info_key(fixed_author_name)
            cached = None
            if EVENT_REDIS.is_set():
                # try to get the info from Redis
                cached = await self.redis.get(redis_key)
            # the value is saved as "<url>|<content>", so without
            # a separator there is no usable cached value
            url, separator, content = (cached or "").partition("|")
            if separator:
                remaining_ttl = await self.redis.ttl(redis_key)
                # the key is saved with AUTHOR_INFO_NEW_TTL, so the part of
                # the TTL that already elapsed is the age of the info
                creation_date = datetime.now(tz=timezone.utc) - timedelta(
                    seconds=AUTHOR_INFO_NEW_TTL - remaining_ttl
                )
                author.info = (url, content, creation_date)
            else:
                author.info = await search_wikipedia(fixed_author_name)
                if author.info is None or author.info[1] is None:
                    # nothing found
                    LOGGER.info("No information found about %r", author)
                elif EVENT_REDIS.is_set():
                    await self.redis.setex(
                        redis_key,
                        AUTHOR_INFO_NEW_TTL,
                        # value to save (the author info)
                        f"{author.info[0]}|{author.info[1]}",
                    )

        wqs = get_wrong_quotes(
            lambda wq: wq.author_id == author_id,
            sort=True,
        )

        await self.render(
            "pages/quotes/author_info.html",
            author=author,
            wrong_quotes=wqs,
            title="Autor-Informationen",
            short_title="Autor-Info",
            type="Autor",
            id=author_id,
            text=str(author),
            description=f"Falsch zugeordnete Zitate mit „{author}“ als Autor.",
            create_kwargs={"author": author_id},
        )

    def get_redis_info_key(self, author_name: str) -> str:
        """Get the key to save the author info with Redis."""
        return f"{self.redis_prefix}:quote-author-info:{author_name}"