diff --git a/src/main/java/org/broadinstitute/hellbender/cmdline/ReadFilterArgumentDefinitions.java b/src/main/java/org/broadinstitute/hellbender/cmdline/ReadFilterArgumentDefinitions.java index 51df98ffc58..8acc8966928 100644 --- a/src/main/java/org/broadinstitute/hellbender/cmdline/ReadFilterArgumentDefinitions.java +++ b/src/main/java/org/broadinstitute/hellbender/cmdline/ReadFilterArgumentDefinitions.java @@ -30,6 +30,8 @@ private ReadFilterArgumentDefinitions(){} public static final String FILTER_TOO_SHORT_NAME = "filter-too-short"; public static final String DONT_REQUIRE_SOFT_CLIPS_BOTH_ENDS_NAME = "dont-require-soft-clips-both-ends"; + public static final String MAXIMUM_END_CLIPPED_BASES = "max-clipped-bases"; + public static final String PL_FILTER_NAME_LONG_NAME = "platform-filter-name"; public static final String BLACK_LISTED_LANES_LONG_NAME = "black-listed-lanes"; diff --git a/src/main/java/org/broadinstitute/hellbender/engine/filters/ExcessiveEndClippedReadFilter.java b/src/main/java/org/broadinstitute/hellbender/engine/filters/ExcessiveEndClippedReadFilter.java new file mode 100644 index 00000000000..fb1343e0d82 --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/engine/filters/ExcessiveEndClippedReadFilter.java @@ -0,0 +1,88 @@ +package org.broadinstitute.hellbender.engine.filters; + +import com.google.common.annotations.VisibleForTesting; +import htsjdk.samtools.CigarElement; +import htsjdk.samtools.CigarOperator; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; +import org.broadinstitute.barclay.argparser.Argument; +import org.broadinstitute.barclay.help.DocumentedFeature; +import org.broadinstitute.hellbender.cmdline.ReadFilterArgumentDefinitions; +import org.broadinstitute.hellbender.utils.help.HelpConstants; +import org.broadinstitute.hellbender.utils.read.GATKRead; + +import java.util.Collections; +import java.util.List; + +/** + * Filter out reads where the number of soft-/hard-clipped bases on either end is above a certain threshold. + * + * This filter will filter reads where the number of soft clips on one end of the read is too large. It does NOT + * check the total number of bases clipped on BOTH ends. For the purposes of this filter, the count of soft and hard + * clipped bases on either end of a read are combined. + * + * For example, a read with the following cigar strings will be filtered out (given maxClippedBases=1000): + * + * 1500S0000M + * 900H300S50000M + * 1001H50000M + * + * However, this read will NOT be filtered out (given maxClippedBases=1000): + * + * 900S50000M900S + * 150H800S50000M900S150H + * 1000S500M + * 500M1000S + * + * Note: This is designed for use with (unpaired) long reads, but can be used with any reads. + */ +@DocumentedFeature( + groupName= HelpConstants.DOC_CAT_READFILTERS, + groupSummary=HelpConstants.DOC_CAT_READFILTERS_SUMMARY, + summary = "Filter out reads that have too many clipped bases on either end." +) +public final class ExcessiveEndClippedReadFilter extends ReadFilter { + static final long serialVersionUID = 1L; + + @Argument(fullName = ReadFilterArgumentDefinitions.MAXIMUM_END_CLIPPED_BASES, + doc = "Maximum number of clipped bases on either end of a given read", + optional = true) + private int maxClippedBases = 1000; + + // Command line parser requires a no-arg constructor + public ExcessiveEndClippedReadFilter() {} + + @VisibleForTesting + ExcessiveEndClippedReadFilter(final int maxClippedBases) { + this.maxClippedBases = maxClippedBases; + } + + @Override + public boolean test(final GATKRead read) { + final List cigarElements = read.getCigarElements(); + + // Check front end of the read: + int clippedBases = 0; + for ( final CigarElement element : cigarElements) { + if ( element.getOperator() != CigarOperator.S && element.getOperator() != CigarOperator.H) { + break; + } + clippedBases += element.getLength(); + } + // If we fail on the front end we can quit now: + if (clippedBases > maxClippedBases) { + return false; + } + + // Check back end of the read: + clippedBases = 0; + for ( int i = cigarElements.size() - 1; i >= 0; --i ) { + final CigarElement element = cigarElements.get(i); + if ( element.getOperator() != CigarOperator.S && element.getOperator() != CigarOperator.H) { + break; + } + clippedBases += element.getLength(); + } + return clippedBases <= maxClippedBases; + } +} diff --git a/src/test/java/org/broadinstitute/hellbender/engine/filters/ExcessiveEndClippedReadFilterUnitTest.java b/src/test/java/org/broadinstitute/hellbender/engine/filters/ExcessiveEndClippedReadFilterUnitTest.java new file mode 100644 index 00000000000..ee724fdad47 --- /dev/null +++ b/src/test/java/org/broadinstitute/hellbender/engine/filters/ExcessiveEndClippedReadFilterUnitTest.java @@ -0,0 +1,77 @@ +package org.broadinstitute.hellbender.engine.filters; + +import htsjdk.samtools.Cigar; +import htsjdk.samtools.SAMFileHeader; +import htsjdk.samtools.TextCigarCodec; +import org.broadinstitute.hellbender.GATKBaseTest; +import org.broadinstitute.hellbender.utils.read.ArtificialReadUtils; +import org.broadinstitute.hellbender.utils.read.GATKRead; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + + +public class ExcessiveEndClippedReadFilterUnitTest extends GATKBaseTest { + + private static final int CHR_COUNT = 1; + private static final int CHR_START = 1; + private static final int CHR_SIZE = 1000; + private static final int GROUP_COUNT = 5; + + private final SAMFileHeader header= ArtificialReadUtils.createArtificialSamHeaderWithGroups(CHR_COUNT, CHR_START, CHR_SIZE, GROUP_COUNT); + + private GATKRead buildSAMRead(final String cigarString) { + final Cigar cigar = TextCigarCodec.decode(cigarString); + return ArtificialReadUtils.createArtificialRead(header, cigar); + } + + @Test(dataProvider= "provideForTestExcessiveEndClipFilter") + public void testExcessiveEndClipFilter(final int maxClippedBases, + final String cigarString, + final boolean expectedResult) { + + final ExcessiveEndClippedReadFilter filter = new ExcessiveEndClippedReadFilter(maxClippedBases); + final GATKRead read = buildSAMRead(cigarString); + Assert.assertEquals(filter.test(read), expectedResult, cigarString); + } + + @DataProvider(name = "provideForTestExcessiveEndClipFilter") + public Object[][] overclippedDataProvider() { + return new Object[][] { + // Failing test cases: + { 50, "1S10M", true }, + { 50, "10M1S", true }, + { 50, "49S10M", true }, + { 50, "50S10M", true }, + { 50, "10M49S", true }, + { 50, "10M50S", true }, + { 50, "49S10M1S", true }, + { 50, "50S10M1S", true }, + { 50, "1S10M49S", true }, + { 50, "1S10M50S", true }, + { 50, "49H10M1S", true }, + { 50, "50H10M1S", true }, + { 50, "1H10M49S", true }, + { 50, "1H10M50S", true }, + { 50, "49H10M49H", true }, + { 50, "50H10M50H", true }, + { 50, "40H9S10M", true }, + { 50, "40H10S10M", true }, + { 50, "10M40H9S", true }, + { 50, "10M40H10S", true }, + + // Passing test cases: + { 50, "10M51S", false }, + { 50, "51S10M", false }, + { 50, "51S10M51S", false }, + + { 50, "10M51H", false }, + { 50, "51H10M", false }, + { 50, "51H10M51H", false }, + + { 50, "10M26S26H", false }, + { 50, "26H26S10M", false }, + { 50, "26H26S10M26S26H", false }, + }; + } +}