Merge pull request #277 from europeana/EA-3593_fixIllegalCharacterError

EA-3593 fix illegal character - replaced ByteArray Streams with StringWriter
europeana · Nov 8, 2023 · b54f44a · b54f44a
2 parents 3c6fe55 + 2bdcde2
commit b54f44a
Show file tree

Hide file tree

Showing 2 changed files with 112 additions and 5 deletions.
diff --git a/corelib-storage/src/main/java/eu/europeana/corelib/edm/utils/EdmUtils.java b/corelib-storage/src/main/java/eu/europeana/corelib/edm/utils/EdmUtils.java
@@ -19,11 +19,10 @@
 import org.jibx.runtime.IMarshallingContext;
 import org.jibx.runtime.JiBXException;
 
-import java.io.ByteArrayOutputStream;
 import java.io.IOException;
+import java.io.StringWriter;
 import java.lang.reflect.InvocationTargetException;
 import java.lang.reflect.Method;
-import java.nio.charset.StandardCharsets;
 import java.util.*;
 import java.util.Map.Entry;
 
@@ -82,11 +81,11 @@ public static synchronized String toEDM(RDF rdf) {
 
     private static String marshallToEDM(RDF rdf) {
         IMarshallingContext marshallingContext;
-        try (ByteArrayOutputStream out = new ByteArrayOutputStream()){
+        try (StringWriter out = new StringWriter()){
             marshallingContext = bfact.createMarshallingContext();
-            marshallingContext.setOutput(out, null);
+            marshallingContext.setOutput(out, EuropeanaUTF8Escaper.s_instance);
             marshallingContext.marshalDocument(rdf, "UTF-8", true);
-            return out.toString(StandardCharsets.UTF_8);
+            return out.toString();
         } catch (JiBXException | IOException e) {
             String id = null;
             if (rdf != null && rdf.getProvidedCHOList() != null && !rdf.getProvidedCHOList().isEmpty()) {

diff --git a/corelib-storage/src/main/java/eu/europeana/corelib/edm/utils/EuropeanaUTF8Escaper.java b/corelib-storage/src/main/java/eu/europeana/corelib/edm/utils/EuropeanaUTF8Escaper.java
@@ -0,0 +1,108 @@
+package eu.europeana.corelib.edm.utils;
+
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+import org.jibx.runtime.ICharacterEscaper;
+
+import java.io.IOException;
+import java.io.Writer;
+
+public class EuropeanaUTF8Escaper  implements ICharacterEscaper{
+
+    private static final Logger LOG = LogManager.getLogger(EuropeanaUTF8Escaper.class);
+
+    public static final EuropeanaUTF8Escaper s_instance = new EuropeanaUTF8Escaper();
+
+    private EuropeanaUTF8Escaper() {
+    }
+
+    public void writeAttribute(String text, Writer writer) throws IOException {
+        int mark = 0;
+
+        for(int i = 0; i < text.length(); ++i) {
+            char chr = text.charAt(i);
+            if (chr == '"') {
+                writer.write(text, mark, i - mark);
+                mark = i + 1;
+                writer.write("&quot;");
+            } else if (chr == '&') {
+                writer.write(text, mark, i - mark);
+                mark = i + 1;
+                writer.write("&amp;");
+            } else if (chr == '<') {
+                writer.write(text, mark, i - mark);
+                mark = i + 1;
+                writer.write("&lt;");
+            } else if (chr == '>' && i > 2 && text.charAt(i - 1) == ']' && text.charAt(i - 2) == ']') {
+                writer.write(text, mark, i - mark - 2);
+                mark = i + 1;
+                writer.write("]]&gt;");
+            } else if (chr < ' ') {
+                if (chr != '\t' && chr != '\n' && chr != '\r') {
+                    LOG.error("Illegal Character code 0x{} : {} in attribute value text : {}", Integer.toHexString(chr), chr, text);
+                }
+            } else if (chr > '\ud7ff' && (chr < '\ue000' || chr == '\ufffe' || chr == '\uffff' || chr > 1114111)) {
+                LOG.error("Illegal Character code 0x{} : {} in attribute value text : {}", Integer.toHexString(chr), chr, text);
+            }
+        }
+
+        writer.write(text, mark, text.length() - mark);
+    }
+
+    public void writeContent(String text, Writer writer) throws IOException {
+        int mark = 0;
+
+        for(int i = 0; i < text.length(); ++i) {
+            char chr = text.charAt(i);
+            if (chr == '&') {
+                writer.write(text, mark, i - mark);
+                mark = i + 1;
+                writer.write("&amp;");
+            } else if (chr == '<') {
+                writer.write(text, mark, i - mark);
+                mark = i + 1;
+                writer.write("&lt;");
+            } else if (chr == '>' && i > 2 && text.charAt(i - 1) == ']' && text.charAt(i - 2) == ']') {
+                writer.write(text, mark, i - mark - 2);
+                mark = i + 1;
+                writer.write("]]&gt;");
+            } else if (chr < ' ') {
+                if (chr != '\t' && chr != '\n' && chr != '\r') {
+                    LOG.error("Illegal Character code 0x{} : {} in content text : {}", Integer.toHexString(chr), chr, text);
+                }
+            } else if (chr > '\ud7ff' && (chr < '\ue000' || chr == '\ufffe' || chr == '\uffff' || chr > 1114111)) {
+                LOG.error("Illegal Character code 0x{} : {} in content text : {}", Integer.toHexString(chr), chr, text);
+            }
+        }
+
+        writer.write(text, mark, text.length() - mark);
+    }
+
+    public void writeCData(String text, Writer writer) throws IOException {
+        writer.write("<![CDATA[");
+
+        for(int i = 0; i < text.length(); ++i) {
+            char chr = text.charAt(i);
+            if (chr == '>' && i > 2 && text.charAt(i - 1) == ']' && text.charAt(i - 2) == ']') {
+                throw new IOException("Sequence \"]]>\" is not allowed within CDATA section text");
+            }
+
+            if (chr < ' ') {
+                if (chr != '\t' && chr != '\n' && chr != '\r') {
+                    LOG.error("Illegal Character code 0x{} : {} in CDATA section. text : {}", Integer.toHexString(chr), chr, text);
+                }
+            } else if (chr > '\ud7ff' && (chr < '\ue000' || chr == '\ufffe' || chr == '\uffff')) {
+                LOG.error("Illegal Character code 0x{} : {} in CDATA section. text : {}", Integer.toHexString(chr), chr, text);
+            }
+        }
+
+        writer.write(text);
+        writer.write("]]>");
+    }
+
+    public static ICharacterEscaper getInstance() {
+        return s_instance;
+    }
+}
+
+