From 43b2db470212bf4c5a8810535c73e92366158958 Mon Sep 17 00:00:00 2001 From: Viktor Holzwert Date: Wed, 6 Dec 2023 13:11:56 +0100 Subject: [PATCH] WhitespaceNormalizeEnhancer: normalize whitespace and character encodings --- .../impl/WhitespaceNormalizeEnhancer.java | 122 ++++++++++++++++++ .../config/spring/api/metadata-enhancers.xml | 11 ++ 2 files changed, 133 insertions(+) create mode 100644 dspace-api/src/main/java/org/dspace/content/enhancer/impl/WhitespaceNormalizeEnhancer.java diff --git a/dspace-api/src/main/java/org/dspace/content/enhancer/impl/WhitespaceNormalizeEnhancer.java b/dspace-api/src/main/java/org/dspace/content/enhancer/impl/WhitespaceNormalizeEnhancer.java new file mode 100644 index 000000000000..7ef51ee00cb8 --- /dev/null +++ b/dspace-api/src/main/java/org/dspace/content/enhancer/impl/WhitespaceNormalizeEnhancer.java @@ -0,0 +1,122 @@ +/** + * The contents of this file are subject to the license and copyright + * detailed in the LICENSE and NOTICE files at the root of the source + * tree and available online at + * + * http://www.dspace.org/license/ + */ +package org.dspace.content.enhancer.impl; + +import java.text.Normalizer; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; + +import org.dspace.content.Item; +import org.dspace.content.MetadataValue; +import org.dspace.content.service.ItemService; +import org.dspace.content.service.MetadataFieldService; +import org.dspace.core.Context; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.springframework.beans.factory.annotation.Autowired; + +/** Enhancer that reads the values of the configured metadata fields of an item, passes each value through a fixed chain of clean-up helpers (special spaces, special newlines, newline conversion, whitespace before line ends, zero-width spaces, NFC normalization) and hands the collected values to updateItem when at least one value changed. NOTE(review): in this copy of the patch the generic type arguments, part of the class body and the XML bean definitions appear to have been stripped; confirm every detail against the original commit. */ public class WhitespaceNormalizeEnhancer extends RewriteEnhancer { + protected static final Logger LOGGER = LoggerFactory.getLogger(WhitespaceNormalizeEnhancer.class); + /** Names of the metadata fields whose values enhance() processes; assigned through setMetaDataFields. NOTE(review): iterated in enhance() without a null check. */ private List metaDataFields; + + @Autowired + protected ItemService itemService; + + @Autowired + protected MetadataFieldService metadatafieldService; + + /** Entity type compared in canEnhance(); while it is null, canEnhance() returns true for every item. */ protected String sourceEntityType; + + /** Stores the entity type that canEnhance() compares against. */ public void setSourceEntityType(String 
sourceEntityType) { + this.sourceEntityType = sourceEntityType; + } + + /** Returns true when no source entity type is configured, or when the configured type equals the entity type that itemService reports for the item. */ @Override + public boolean canEnhance(Context context, Item item) { + return sourceEntityType == null || sourceEntityType.equals(itemService.getEntityType(item)); + } + + /** For each configured metadata field, loads the values of the item, runs every value through the clean-up helpers in the order written below and collects the results per field name. updateItem is called with the collected map only when at least one value differs from its original text. NOTE(review): updateItem is not defined in the visible text, presumably inherited from RewriteEnhancer. */ @Override + public void enhance(Context context, Item item) { + HashMap> newValues = new HashMap<>(); + boolean modified = false; + for (String metaDataField : metaDataFields) { + List values = itemService.getMetadataByMetadataString(item, metaDataField); + /* fields for which the service returns null are skipped */ if (values == null) { + continue; + } + for (MetadataValue mv : values) { + /* NOTE(review): the value is used without a null check by the helpers below; confirm it can never be null */ String text = mv.getValue(); + /* keep the untouched text so a change can be detected after the chain */ String textCopy = text; + text = replaceSpecialSpaces(text); + text = replaceSpecialNewline(text); + text = replaceNewlineToReturnNewline(text); + text = removeWhitespacesBeforeReturnNewline(text); + text = removeZeroWidthSpaces(text); + text = normalizeToNFC(text); + /* create the per-field result list on first use */ if (!newValues.containsKey(metaDataField)) { + newValues.put(metaDataField, new ArrayList<>()); + } + /* unchanged values are collected as well; only a changed value sets the modified flag */ if (text.equals(textCopy)) { + newValues.get(metaDataField).add(textCopy); + } else { + newValues.get(metaDataField).add(text); + modified = true; + } + } + } + if (modified) { + updateItem(context, item, newValues); + } + } + + /** Applies replaceAll with the given regex and replacement over and over until a pass leaves the string unchanged, then returns that string. */ private static String doWhileReplaceAll(String original, String regex, String replacement) { + String temp; + do { + temp = original; + original = temp.replaceAll(regex, replacement); + } + while (!temp.equals(original)); + return original; + } + + /** Replaces every run of the listed code points (U+0009, U+00A0, U+1680, U+2000 to U+200A, U+202F, U+205F, U+3000) and every run of two or more U+0020 characters with one U+0020, repeating until the text is stable. */ private static String replaceSpecialSpaces(String original) { + String regex = "(\\u0009|\\u00A0|\\u1680|\\u2000|\\u2001|\\u2002|\\u2003|\\u2004|\\u2005|" + + "\\u2006|\\u2007|\\u2008|\\u2009|\\u200A|\\u202F|\\u205F|\\u3000)+|(\\u0020){2,}"; + return doWhileReplaceAll(original, regex, " "); + } + + /** Replaces each occurrence of U+000B, U+000C, U+0085, U+2028 or U+2029 with a line feed. */ private static String replaceSpecialNewline(String original) { + String regex = "(\\u000B|\\u000C|\\u0085|\\u2028|\\u2029)"; + return doWhileReplaceAll(original, regex, "\n"); + } + + /** NOTE(review): the body of this method and the definitions that follow it are truncated in this copy; the regex literal runs straight into the parameter list of setMetaDataFields. Restore the text from the original commit before relying on it. */ private static String replaceNewlineToReturnNewline(String 
original) { + String regex = "(? metaDataFields) { + this.metaDataFields = metaDataFields; + } +} diff --git a/dspace/config/spring/api/metadata-enhancers.xml b/dspace/config/spring/api/metadata-enhancers.xml index eb87c3202691..063be1fdf363 100644 --- a/dspace/config/spring/api/metadata-enhancers.xml +++ b/dspace/config/spring/api/metadata-enhancers.xml @@ -71,4 +71,15 @@ + + + + + dc.title + dc.title.alternative + dc.description.abstract + + + +