From 6f9f2fd4d0bf4c08425722df5ba67ad908cd851f Mon Sep 17 00:00:00 2001
From: miaochangxin
Date: Mon, 22 Jan 2024 01:21:10 +0800
Subject: [PATCH] Also escape meta descriptions

---
 page_content_extractor/html.py | 3 ++-
 test/test_html_parser.py       | 9 +++++++++
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/page_content_extractor/html.py b/page_content_extractor/html.py
index 5ca970c..f76a655 100644
--- a/page_content_extractor/html.py
+++ b/page_content_extractor/html.py
@@ -143,7 +143,8 @@ def get_meta_description(self):
         for desc in descs:
             content = desc.get('content', '')
             if len(content) > len(self._meta_desc):
-                self._meta_desc = content
+                # Reason to escape https://github.com/berthubert/trifecta/issues/38
+                self._meta_desc = escape(content)
         return self._meta_desc
 
     def get_meta_image(self):
diff --git a/test/test_html_parser.py b/test/test_html_parser.py
index 26cf4d8..cb36a9e 100644
--- a/test/test_html_parser.py
+++ b/test/test_html_parser.py
@@ -247,6 +247,15 @@ def test_longer_meta_description(self):
         parser = HtmlContentExtractor(html_doc)
         self.assertEqual(parser.get_meta_description(), "aaaa")
 
+    def test_need_escape_unsafe_meta_description(self):
+        content = "The upload handler checks that the content type starts with &quot;image/&quot;, but this check includes the image/svg+xml content type, so the following image is accepted: &lt;?xml version=&quot;1.0&quot; encoding=&quot;UTF-8&quot; standalone=&quot;no&quot;?&gt; &lt;svg xmlns=&quot;http://w..."
+        html_doc = f"""
+        <meta name="description" content="{content}" />
+        """
+        parser = HtmlContentExtractor(html_doc)
+        # &quot; == &#34;
+        self.assertEqual(parser.get_meta_description(), content.replace('&quot;', '&#34;'))
+
     def test_get_all_meta_images(self):
         src = 'https://opengraph.githubassets.com/740568cb37e42d5beb5c65378e1f66a0a72e5cb1650c8a45df4466e9472825a2/tikv/agatedb'
         html_doc = f"""