Coverage for an_website/quotes/info.py: 83.951%

1# This program is free software: you can redistribute it and/or modify

2# it under the terms of the GNU Affero General Public License as

3# published by the Free Software Foundation, either version 3 of the

4# License, or (at your option) any later version.

6# This program is distributed in the hope that it will be useful,

7# but WITHOUT ANY WARRANTY; without even the implied warranty of

8# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

9# GNU Affero General Public License for more details.

10#

11# You should have received a copy of the GNU Affero General Public License

12# along with this program. If not, see <https://www.gnu.org/licenses/>.

14"""Info page to show information about authors and quotes."""

16from __future__ import annotations

18import logging

19from datetime import datetime, timedelta, timezone

20from typing import Final, cast

21from urllib.parse import quote as quote_url

23import orjson as json

24import regex

25from tornado.httpclient import AsyncHTTPClient

27from .. import CA_BUNDLE_PATH, EVENT_REDIS

28from ..utils.request_handler import HTMLRequestHandler

29from .utils import get_author_by_id, get_quote_by_id, get_wrong_quotes

# module-level logger, named after this module
LOGGER: Final = logging.getLogger(__name__)

class QuotesInfoPage(HTMLRequestHandler):
    """Request handler that renders the info page of a single quote."""

    RATELIMIT_GET_LIMIT = 30

    async def get(self, id_str: str, *, head: bool = False) -> None:
        """
        Render the info page of the quote with the ID ``id_str``.

        The quote is looked up before the ``head`` check, so the lookup
        also runs for HEAD requests; only the rendering is skipped.
        """
        quote_id = int(id_str)
        quote = await get_quote_by_id(quote_id)
        if head:
            return

        # everything the template needs, in one place
        template_args = {
            "quote": quote,
            # all wrong quotes that use this quote, sorted
            "wrong_quotes": get_wrong_quotes(
                lambda wq: wq.quote_id == quote_id, sort=True
            ),
            "title": "Zitat-Informationen",
            "short_title": "Zitat-Info",
            "type": "Zitat",
            "id": quote_id,
            "text": str(quote),
            "description": (
                f"Falsch zugeordnete Zitate mit „{quote}“ als Zitat."
            ),
            "create_kwargs": {"quote": quote_id},
        }
        await self.render("pages/quotes/quote_info.html", **template_args)

# API endpoints of the German and the English Wikipedia
WIKI_API_DE: Final[str] = "https://de.wikipedia.org/w/api.php"
WIKI_API_EN: Final[str] = "https://en.wikipedia.org/w/api.php"

async def search_wikipedia(
    query: str, api: str = WIKI_API_DE
) -> None | tuple[str, None | str, datetime]:
    """
    Look the query up on Wikipedia using the opensearch API.

    Return ``None`` if the query is empty or nothing was found.
    Otherwise return a tuple with the URL of the page, the content of
    the page (may be ``None``) and the current time in UTC.

    If a search against the German API finds nothing, the search is
    repeated against the English API.
    """
    if not query:
        return None

    search_url = (
        f"{api}?action=opensearch&namespace=0&profile=normal&"
        f"search={quote_url(query)}&limit=1&redirects=resolve&format=json"
    )
    response = await AsyncHTTPClient().fetch(
        search_url,
        ca_certs=CA_BUNDLE_PATH,
    )
    search_result = json.loads(response.body)

    # index 1 holds the names of the found pages, index 3 their URLs
    page_names = search_result[1]
    if page_names:
        # the URL of the first hit, with "," escaped as "%2C"
        page_url = str(search_result[3][0]).replace(",", "%2C")
        content = await get_wikipedia_page_content(page_names[0], api)
        return page_url, content, datetime.now(timezone.utc)

    if api == WIKI_API_DE:
        # nothing found in German, try again in English
        return await search_wikipedia(query, WIKI_API_EN)
    return None  # nothing found

async def get_wikipedia_page_content(
    page_name: str, api: str = WIKI_API_DE
) -> None | str:
    """
    Get the intro of a Wikipedia page as plain text.

    The request asks for a plain-text extract of the intro that is
    limited to five sentences.

    Return ``None`` if the response contains no pages or if the first
    page in the response has no extract.
    """
    response = await AsyncHTTPClient().fetch(
        (
            f"{api}?action=query&prop=extracts&exsectionformat=plain&exintro&"
            f"titles={quote_url(page_name)}&explaintext&format=json&exsentences=5"
        ),
        ca_certs=CA_BUNDLE_PATH,
    )
    response_json = json.loads(response.body)
    # "pages" maps page IDs to page objects
    pages = response_json.get("query", {}).get("pages")
    if not pages:
        # also covers an empty mapping, which has no first page
        return None
    # only one title was requested, so only the first page is relevant
    page = cast(dict[str, str], next(iter(pages.values())))
    return page.get("extract")

117

118

def fix_author_for_wikipedia_search(author: str) -> str:
    """
    Fix author for Wikipedia search.

    This tries to reduce common problems with authors,
    so that we can show more information.

    Whitespace is collapsed and a trailing "Werbespruch" or "Werbung"
    as well as a leading "nach" or "Ein" is removed, ignoring case.
    """
    author = regex.sub(r"\s+", " ", author)
    # NOTE(review): as written this only strips trailing whitespace; the
    # "$.*$" looks like a garbled r"\s*\(.*\)" (drop a parenthesised
    # remark) -- confirm against the upstream file before changing it
    author = regex.sub(r"\s*$.*$", "", author)
    # The flags have to be passed by keyword, because the fourth
    # positional parameter of sub() is count, not flags.
    for pattern in (
        r"\s*Werbespruch$",
        r"\s*Werbung$",
        # \b keeps names that merely start with "nach" intact
        r"^nach\b\s*",
        r"^Ein\s+",
    ):
        author = regex.sub(pattern, "", author, flags=regex.IGNORECASE)
    return author

133

134

# time to live of newly cached author info in seconds (30 days)
AUTHOR_INFO_NEW_TTL: Final[int] = 60 * 60 * 24 * 30

137

138

class AuthorsInfoPage(HTMLRequestHandler):
    """The request handler used for the author info page."""

    RATELIMIT_GET_LIMIT = 5

    async def get(self, id_str: str, *, head: bool = False) -> None:
        """
        Handle GET requests to the author info page.

        If the author has no info yet, it is taken from Redis (if Redis
        is available and has a usable entry) or searched on Wikipedia.
        A successful Wikipedia result is saved in Redis for
        ``AUTHOR_INFO_NEW_TTL`` seconds.

        The author is looked up before the ``head`` check, so the
        lookup also runs for HEAD requests.
        """
        author_id: int = int(id_str)
        author = await get_author_by_id(author_id)
        if head:
            return
        if author.info is None:
            result = None
            fixed_author_name = fix_author_for_wikipedia_search(author.name)
            if EVENT_REDIS.is_set():
                # try to get the info from Redis
                result = await self.redis.get(
                    self.get_redis_info_key(fixed_author_name)
                )
            # saved values have the form "<url>|<content>"
            info = result.split("|", maxsplit=1) if result else []
            if len(info) == 2:
                remaining_ttl = await self.redis.ttl(
                    self.get_redis_info_key(fixed_author_name)
                )
                # the entry was saved with AUTHOR_INFO_NEW_TTL, so the
                # part of the TTL that is used up is the age of the entry
                creation_date = datetime.now(tz=timezone.utc) - timedelta(
                    seconds=AUTHOR_INFO_NEW_TTL - remaining_ttl
                )
                author.info = (info[0], info[1], creation_date)
            else:
                author.info = await search_wikipedia(fixed_author_name)
                if author.info is None or author.info[1] is None:
                    # nothing found
                    LOGGER.info("No information found about %r", author)
                elif EVENT_REDIS.is_set():
                    await self.redis.setex(
                        self.get_redis_info_key(fixed_author_name),
                        AUTHOR_INFO_NEW_TTL,
                        # value to save (the author info)
                        # type is ignored, because author.info[1] is not None
                        "|".join(author.info[0:2]),  # type: ignore[arg-type]
                    )

        wqs = get_wrong_quotes(
            lambda wq: wq.author_id == author_id,
            sort=True,
        )

        await self.render(
            "pages/quotes/author_info.html",
            author=author,
            wrong_quotes=wqs,
            title="Autor-Informationen",
            short_title="Autor-Info",
            type="Autor",
            id=author_id,
            text=str(author),
            description=f"Falsch zugeordnete Zitate mit „{author}“ als Autor.",
            create_kwargs={"author": author_id},
        )

    def get_redis_info_key(self, author_name: str) -> str:
        """Get the key to save the author info with Redis."""
        return f"{self.redis_prefix}:quote-author-info:{author_name}"

203 return f"{self.redis_prefix}:quote-author-info:{author_name}"