Skip to content

Commit

Permalink
Also escape meta descriptions
Browse files Browse the repository at this point in the history
  • Loading branch information
polyrabbit committed Jan 21, 2024
1 parent 9db6d57 commit 6f9f2fd
Show file tree
Hide file tree
Showing 2 changed files with 11 additions and 1 deletion.
3 changes: 2 additions & 1 deletion page_content_extractor/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,8 @@ def get_meta_description(self):
for desc in descs:
content = desc.get('content', '')
if len(content) > len(self._meta_desc):
self._meta_desc = content
# Reason to escape https://github.com/berthubert/trifecta/issues/38
self._meta_desc = escape(content)
return self._meta_desc

def get_meta_image(self):
Expand Down
9 changes: 9 additions & 0 deletions test/test_html_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -247,6 +247,15 @@ def test_longer_meta_description(self):
parser = HtmlContentExtractor(html_doc)
self.assertEqual(parser.get_meta_description(), "aaaa")

def test_need_escape_unsafe_meta_description(self):
    """Check that get_meta_description() returns an HTML-escaped description.

    The page's meta description carries markup-like text (quotes and an
    inline SVG/XML snippet). The value handed back by the extractor must
    keep those characters escaped rather than return them as raw markup.
    """
    # The description is spelled with HTML entities: this keeps the Python
    # literal valid and keeps the double-quoted ``content`` attribute intact.
    # Raw `"`, `<` and `>` here would break the literal / the attribute and
    # could never equal an escaped return value.
    content = (
        "The upload handler checks that the content type starts with "
        "&quot;image/&quot;, but this check includes the image/svg+xml "
        "content type, so the following image is accepted: "
        "&lt;?xml version=&quot;1.0&quot; encoding=&quot;UTF-8&quot; "
        "standalone=&quot;no&quot;?&gt; &lt;svg xmlns=&quot;http://w..."
    )
    html_doc = f"""
    <meta name="description" content="{content}">
    """
    parser = HtmlContentExtractor(html_doc)
    # &quot; == &#34;: the extractor's escape step is expected to emit the
    # numeric form for double quotes, so normalise the expectation to match.
    # NOTE(review): assumes `escape` in html.py emits `&#34;` — confirm.
    expected = content.replace('&quot;', '&#34;')
    self.assertEqual(parser.get_meta_description(), expected)

def test_get_all_meta_images(self):
src = 'https://opengraph.githubassets.com/740568cb37e42d5beb5c65378e1f66a0a72e5cb1650c8a45df4466e9472825a2/tikv/agatedb'
html_doc = f"""
Expand Down

0 comments on commit 6f9f2fd

Please sign in to comment.