diff --git a/page_content_extractor/html.py b/page_content_extractor/html.py
index 5ca970c..f76a655 100644
--- a/page_content_extractor/html.py
+++ b/page_content_extractor/html.py
@@ -143,7 +143,8 @@ def get_meta_description(self):
for desc in descs:
content = desc.get('content', '')
if len(content) > len(self._meta_desc):
- self._meta_desc = content
+ # Reason to escape https://github.com/berthubert/trifecta/issues/38
+ self._meta_desc = escape(content)
return self._meta_desc
def get_meta_image(self):
diff --git a/test/test_html_parser.py b/test/test_html_parser.py
index 26cf4d8..cb36a9e 100644
--- a/test/test_html_parser.py
+++ b/test/test_html_parser.py
@@ -247,6 +247,15 @@ def test_longer_meta_description(self):
parser = HtmlContentExtractor(html_doc)
self.assertEqual(parser.get_meta_description(), "aaaa")
+    def test_need_escape_unsafe_meta_description(self):
+        """A meta description that carries markup must be returned HTML-escaped."""
+        content = "The upload handler checks that the content type starts with &quot;image/&quot;, but this check includes the image/svg+xml content type, so the following image is accepted: &lt;?xml version=&quot;1.0&quot; encoding=&quot;UTF-8&quot; standalone=&quot;no&quot;?&gt; &lt;svg xmlns=&quot;http://w..."
+        html_doc = f'<meta name="description" content="{content}" />'
+        parser = HtmlContentExtractor(html_doc)
+        # The HTML parser decodes the attribute's entities, so the extractor has to re-escape them.
+        # Escapers spell the double quote as &quot; or &#34;; normalise before comparing.
+        self.assertEqual(parser.get_meta_description().replace('&#34;', '&quot;'), content)
+
def test_get_all_meta_images(self):
src = 'https://opengraph.githubassets.com/740568cb37e42d5beb5c65378e1f66a0a72e5cb1650c8a45df4466e9472825a2/tikv/agatedb'
html_doc = f"""