Skip to content

Commit

Permalink
Merge pull request #277 from europeana/EA-3593_fixIllegalCharacterError
Browse files Browse the repository at this point in the history
EA-3593 fix illegal character - replaced ByteArray Streams with Strin…
  • Loading branch information
SrishtiSingh-eu committed Nov 8, 2023
2 parents 3c6fe55 + 2bdcde2 commit b54f44a
Show file tree
Hide file tree
Showing 2 changed files with 112 additions and 5 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,10 @@
import org.jibx.runtime.IMarshallingContext;
import org.jibx.runtime.JiBXException;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.StringWriter;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
import java.nio.charset.StandardCharsets;
import java.util.*;
import java.util.Map.Entry;

Expand Down Expand Up @@ -82,11 +81,11 @@ public static synchronized String toEDM(RDF rdf) {

private static String marshallToEDM(RDF rdf) {
IMarshallingContext marshallingContext;
try (ByteArrayOutputStream out = new ByteArrayOutputStream()){
try (StringWriter out = new StringWriter()){
marshallingContext = bfact.createMarshallingContext();
marshallingContext.setOutput(out, null);
marshallingContext.setOutput(out, EuropeanaUTF8Escaper.s_instance);
marshallingContext.marshalDocument(rdf, "UTF-8", true);
return out.toString(StandardCharsets.UTF_8);
return out.toString();
} catch (JiBXException | IOException e) {
String id = null;
if (rdf != null && rdf.getProvidedCHOList() != null && !rdf.getProvidedCHOList().isEmpty()) {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
package eu.europeana.corelib.edm.utils;

import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.jibx.runtime.ICharacterEscaper;

import java.io.IOException;
import java.io.Writer;

public class EuropeanaUTF8Escaper implements ICharacterEscaper{

private static final Logger LOG = LogManager.getLogger(EuropeanaUTF8Escaper.class);

public static final EuropeanaUTF8Escaper s_instance = new EuropeanaUTF8Escaper();

private EuropeanaUTF8Escaper() {
}

public void writeAttribute(String text, Writer writer) throws IOException {
int mark = 0;

for(int i = 0; i < text.length(); ++i) {
char chr = text.charAt(i);
if (chr == '"') {
writer.write(text, mark, i - mark);
mark = i + 1;
writer.write("&quot;");
} else if (chr == '&') {
writer.write(text, mark, i - mark);
mark = i + 1;
writer.write("&amp;");
} else if (chr == '<') {
writer.write(text, mark, i - mark);
mark = i + 1;
writer.write("&lt;");
} else if (chr == '>' && i > 2 && text.charAt(i - 1) == ']' && text.charAt(i - 2) == ']') {
writer.write(text, mark, i - mark - 2);
mark = i + 1;
writer.write("]]&gt;");
} else if (chr < ' ') {
if (chr != '\t' && chr != '\n' && chr != '\r') {
LOG.error("Illegal Character code 0x{} : {} in attribute value text : {}", Integer.toHexString(chr), chr, text);
}
} else if (chr > '\ud7ff' && (chr < '\ue000' || chr == '\ufffe' || chr == '\uffff' || chr > 1114111)) {
LOG.error("Illegal Character code 0x{} : {} in attribute value text : {}", Integer.toHexString(chr), chr, text);
}
}

writer.write(text, mark, text.length() - mark);
}

public void writeContent(String text, Writer writer) throws IOException {
int mark = 0;

for(int i = 0; i < text.length(); ++i) {
char chr = text.charAt(i);
if (chr == '&') {
writer.write(text, mark, i - mark);
mark = i + 1;
writer.write("&amp;");
} else if (chr == '<') {
writer.write(text, mark, i - mark);
mark = i + 1;
writer.write("&lt;");
} else if (chr == '>' && i > 2 && text.charAt(i - 1) == ']' && text.charAt(i - 2) == ']') {
writer.write(text, mark, i - mark - 2);
mark = i + 1;
writer.write("]]&gt;");
} else if (chr < ' ') {
if (chr != '\t' && chr != '\n' && chr != '\r') {
LOG.error("Illegal Character code 0x{} : {} in content text : {}", Integer.toHexString(chr), chr, text);
}
} else if (chr > '\ud7ff' && (chr < '\ue000' || chr == '\ufffe' || chr == '\uffff' || chr > 1114111)) {
LOG.error("Illegal Character code 0x{} : {} in content text : {}", Integer.toHexString(chr), chr, text);
}
}

writer.write(text, mark, text.length() - mark);
}

public void writeCData(String text, Writer writer) throws IOException {
writer.write("<![CDATA[");

for(int i = 0; i < text.length(); ++i) {
char chr = text.charAt(i);
if (chr == '>' && i > 2 && text.charAt(i - 1) == ']' && text.charAt(i - 2) == ']') {
throw new IOException("Sequence \"]]>\" is not allowed within CDATA section text");
}

if (chr < ' ') {
if (chr != '\t' && chr != '\n' && chr != '\r') {
LOG.error("Illegal Character code 0x{} : {} in CDATA section. text : {}", Integer.toHexString(chr), chr, text);
}
} else if (chr > '\ud7ff' && (chr < '\ue000' || chr == '\ufffe' || chr == '\uffff')) {
LOG.error("Illegal Character code 0x{} : {} in CDATA section. text : {}", Integer.toHexString(chr), chr, text);
}
}

writer.write(text);
writer.write("]]>");
}

public static ICharacterEscaper getInstance() {
return s_instance;
}
}


0 comments on commit b54f44a

Please sign in to comment.