From a60f7a19c0b418fe95fcf2ec0957005ae39e1090 Mon Sep 17 00:00:00 2001 From: Anish Athalye Date: Wed, 22 Jan 2020 16:07:06 -0500 Subject: [PATCH] Fix compatibility with Jython MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch was taken from https://github.com/yaml/pyyaml/issues/369#issuecomment-571596545, authored by Pekka Klärck. In short, Jython doesn't support lone surrogates, so importing yaml (and in particular, loading `reader.py`) caused a UnicodeDecodeError. This patch works around this through a clever use of `eval` to defer evaluation of the string containing the lone surrogates, only doing it on non-Jython platforms. This is only done in `lib/yaml/reader.py` and not `lib3/yaml/reader.py` because Jython does not support Python 3. With this patch, Jython's behavior with respect to Unicode code points over 0xFFFF becomes as it was before 0716ae21a1e7ab6b4ef73428c0c8fff49685d057. It still does not pass all the unit tests on Jython (passes 1275, fails 3, errors on 1); all the failing tests are related to Unicode. Still, this is better than simply crashing upon `import yaml`. With this patch, all tests continue to pass on Python 2 / Python 3. 
--- lib/yaml/reader.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/lib/yaml/reader.py b/lib/yaml/reader.py index 4b377d61..4c421509 100644 --- a/lib/yaml/reader.py +++ b/lib/yaml/reader.py @@ -137,9 +137,14 @@ def determine_encoding(self): self.update(1) if has_ucs4: - NON_PRINTABLE = re.compile(u'[^\x09\x0A\x0D\x20-\x7E\x85\xA0-\uD7FF\uE000-\uFFFD\U00010000-\U0010ffff]') + NON_PRINTABLE = u'[^\x09\x0A\x0D\x20-\x7E\x85\xA0-\uD7FF\uE000-\uFFFD\U00010000-\U0010ffff]' + elif sys.platform.startswith('java'): + # Jython doesn't support lone surrogates https://bugs.jython.org/issue2048 + NON_PRINTABLE = u'[^\x09\x0A\x0D\x20-\x7E\x85\xA0-\uD7FF\uE000-\uFFFD]' else: - NON_PRINTABLE = re.compile(u'[^\x09\x0A\x0D\x20-\x7E\x85\xA0-\uFFFD]|(?:^|[^\uD800-\uDBFF])[\uDC00-\uDFFF]|[\uD800-\uDBFF](?:[^\uDC00-\uDFFF]|$)') + # Need to use eval here due to the above Jython issue + NON_PRINTABLE = eval(r"u'[^\x09\x0A\x0D\x20-\x7E\x85\xA0-\uFFFD]|(?:^|[^\uD800-\uDBFF])[\uDC00-\uDFFF]|[\uD800-\uDBFF](?:[^\uDC00-\uDFFF]|$)'") + NON_PRINTABLE = re.compile(NON_PRINTABLE) def check_printable(self, data): match = self.NON_PRINTABLE.search(data) if match: