Coverage for an_website/quotes/info.py: 83.951%

81 statements  

« prev     ^ index     » next       coverage.py v7.6.4, created at 2024-11-16 19:56 +0000

1# This program is free software: you can redistribute it and/or modify 

2# it under the terms of the GNU Affero General Public License as 

3# published by the Free Software Foundation, either version 3 of the 

4# License, or (at your option) any later version. 

5# 

6# This program is distributed in the hope that it will be useful, 

7# but WITHOUT ANY WARRANTY; without even the implied warranty of 

8# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

9# GNU Affero General Public License for more details. 

10# 

11# You should have received a copy of the GNU Affero General Public License 

12# along with this program. If not, see <https://www.gnu.org/licenses/>. 

13 

14"""Info page to show information about authors and quotes.""" 

15 

16from __future__ import annotations 

17 

18import logging 

19from datetime import datetime, timedelta, timezone 

20from typing import Final, cast 

21from urllib.parse import quote as quote_url 

22 

23import orjson as json 

24import regex 

25from tornado.httpclient import AsyncHTTPClient 

26 

27from .. import CA_BUNDLE_PATH, EVENT_REDIS 

28from ..utils.request_handler import HTMLRequestHandler 

29from .utils import get_author_by_id, get_quote_by_id, get_wrong_quotes 

30 

# module-level logger, named after this module
LOGGER: Final = logging.getLogger(__name__)

32 

33 

class QuotesInfoPage(HTMLRequestHandler):
    """Request handler that renders the info page of a single quote."""

    RATELIMIT_GET_LIMIT = 30

    async def get(self, id_str: str, *, head: bool = False) -> None:
        """
        Handle GET requests to the quote info page.

        The quote is always looked up by its ID; if ``head`` is true
        nothing is rendered afterwards.
        """
        quote_id: int = int(id_str)
        quote = await get_quote_by_id(quote_id)
        if not head:
            # all wrong quotes that use this quote, in sorted order
            related_wrong_quotes = get_wrong_quotes(
                lambda wrong_quote: wrong_quote.quote_id == quote_id,
                sort=True,
            )
            await self.render(
                "pages/quotes/quote_info.html",
                quote=quote,
                wrong_quotes=related_wrong_quotes,
                title="Zitat-Informationen",
                short_title="Zitat-Info",
                type="Zitat",
                id=quote_id,
                text=str(quote),
                description=f"Falsch zugeordnete Zitate mit „{quote}“ als Zitat.",
                create_kwargs={"quote": quote_id},
            )

58 

59 

# API endpoints of the German and the English Wikipedia
WIKI_API_DE: Final[str] = "https://de.wikipedia.org/w/api.php"
WIKI_API_EN: Final[str] = "https://en.wikipedia.org/w/api.php"

62 

63 

async def search_wikipedia(
    query: str, api: str = WIKI_API_DE
) -> None | tuple[str, None | str, datetime]:
    """
    Search Wikipedia to get information about the query.

    If the search on the German API (the default) yields nothing,
    the search is repeated once with the English API.

    Return None if the query is empty or nothing was found.
    Otherwise return a tuple with the URL of the page, the content
    of the page (may be None) and the current time in UTC.
    """
    if not query:
        return None

    search_url = (
        f"{api}?action=opensearch&namespace=0&profile=normal&"
        f"search={quote_url(query)}&limit=1&redirects=resolve&format=json"
    )
    response = await AsyncHTTPClient().fetch(
        search_url,
        ca_certs=CA_BUNDLE_PATH,
    )
    # the page names are read from index 1 and the URLs from index 3
    search_result = json.loads(response.body)
    page_names = search_result[1]

    if page_names:
        # get the URL of the content & replace "," with "%2C"
        page_url = str(search_result[3][0]).replace(",", "%2C")
        content = await get_wikipedia_page_content(page_names[0], api)
        return page_url, content, datetime.now(timezone.utc)

    if api == WIKI_API_DE:
        # nothing found in German, try English
        return await search_wikipedia(query, WIKI_API_EN)
    return None  # nothing found

96 

97 

async def get_wikipedia_page_content(
    page_name: str, api: str = WIKI_API_DE
) -> None | str:
    """
    Get content from a Wikipedia page and return it.

    The API is asked for the extract of the page with the given name
    (see the TextExtracts parameters in the request URL).

    Return None if the response contains no pages or if the first
    page in the response has no extract.
    """
    response = await AsyncHTTPClient().fetch(
        (
            f"{api}?action=query&prop=extracts&exsectionformat=plain&exintro&"
            f"titles={quote_url(page_name)}&explaintext&format=json&exsentences=5"
        ),
        ca_certs=CA_BUNDLE_PATH,
    )
    response_json = json.loads(response.body)
    if "query" not in response_json or "pages" not in response_json["query"]:
        return None
    # "pages" maps a key to the data of a page, not to a plain string
    pages = cast(dict[str, dict[str, str]], response_json["query"]["pages"])
    # only one title is requested, so only the first page is of interest;
    # an empty mapping is treated as "nothing found" instead of raising
    page = next(iter(pages.values()), None)
    if page is None:
        return None
    return page.get("extract")

117 

118 

def fix_author_for_wikipedia_search(author: str) -> str:
    """
    Fix author for Wikipedia search.

    This tries to reduce common problems with authors.
    So that we can show more information.

    The following is done, in this order:
    whitespace runs are collapsed to a single space,
    everything in parentheses is removed,
    a trailing "Werbespruch" or "Werbung" is removed and
    a leading "nach" or "Ein " is removed (all ignoring case).
    """
    author = regex.sub(r"\s+", " ", author)
    author = regex.sub(r"\s*\(.*\)", "", author)
    # The flags have to be passed by keyword, because the fourth
    # positional argument of sub() is the count and not the flags.
    # The word boundaries keep the case-insensitive patterns from
    # cutting longer words apart (e.g. "Nachtigall" or "Bewerbung").
    for pattern in (
        r"\s*\bWerbespruch$",
        r"\s*\bWerbung$",
        r"^nach\b\s*",
        r"^Ein\s+",
    ):
        author = regex.sub(pattern, "", author, flags=regex.IGNORECASE)
    return author

133 

134 

# how long the author info is kept, in seconds (30 days)
AUTHOR_INFO_NEW_TTL: Final[int] = int(timedelta(days=30).total_seconds())

137 

138 

class AuthorsInfoPage(HTMLRequestHandler):
    """The request handler used for the author info page."""

    RATELIMIT_GET_LIMIT = 5

    async def get(self, id_str: str, *, head: bool = False) -> None:
        """
        Handle GET requests to the author info page.

        If the author has no info yet, the info is taken from Redis
        (if EVENT_REDIS is set and a usable value is stored there),
        otherwise Wikipedia is searched. A successful Wikipedia result
        is saved in Redis with a TTL of AUTHOR_INFO_NEW_TTL seconds.

        If ``head`` is true nothing of this happens after the author
        has been looked up and nothing is rendered.
        """
        author_id: int = int(id_str)
        author = await get_author_by_id(author_id)
        if head:
            return
        if author.info is None:
            cached = None
            fixed_author_name = fix_author_for_wikipedia_search(author.name)
            if EVENT_REDIS.is_set():
                # try to get the info from Redis
                cached = await self.redis.get(
                    self.get_redis_info_key(fixed_author_name)
                )
            # saved values have the form "<url>|<content>",
            # everything else is treated as not cached
            info = cached.split("|", maxsplit=1) if cached else []
            if len(info) == 2:
                remaining_ttl = await self.redis.ttl(
                    self.get_redis_info_key(fixed_author_name)
                )
                # the value gets saved with AUTHOR_INFO_NEW_TTL (see below),
                # so the part of the TTL that is gone is the age of the info
                creation_date = datetime.now(tz=timezone.utc) - timedelta(
                    seconds=AUTHOR_INFO_NEW_TTL - remaining_ttl
                )
                author.info = (info[0], info[1], creation_date)
            else:
                author.info = await search_wikipedia(fixed_author_name)
                if author.info is None or author.info[1] is None:
                    # nothing found
                    LOGGER.info("No information found about %r", author)
                elif EVENT_REDIS.is_set():
                    await self.redis.setex(
                        self.get_redis_info_key(fixed_author_name),
                        AUTHOR_INFO_NEW_TTL,
                        # value to save (the author info)
                        f"{author.info[0]}|{author.info[1]}",
                    )

        wqs = get_wrong_quotes(
            lambda wq: wq.author_id == author_id,
            sort=True,
        )

        await self.render(
            "pages/quotes/author_info.html",
            author=author,
            wrong_quotes=wqs,
            title="Autor-Informationen",
            short_title="Autor-Info",
            type="Autor",
            id=author_id,
            text=str(author),
            description=f"Falsch zugeordnete Zitate mit „{author}“ als Autor.",
            create_kwargs={"author": author_id},
        )

    def get_redis_info_key(self, author_name: str) -> str:
        """Get the key to save the author info with Redis."""
        return f"{self.redis_prefix}:quote-author-info:{author_name}"

203 return f"{self.redis_prefix}:quote-author-info:{author_name}"