Skip to content

Commit

Permalink
Added ExcessiveEndClippedReadFilter (#7638)
Browse files Browse the repository at this point in the history
* Added ExcessiveEndClippedReadFilter

- Added ExcessiveEndClippedReadFilter - a filter that will keep reads
that have fewer than the specified number of clipped bases on either
end.  This is designed for long reads, and has a default value of 1k.
  • Loading branch information
jonn-smith committed Jan 14, 2022
1 parent 5b87367 commit caa48f9
Show file tree
Hide file tree
Showing 3 changed files with 167 additions and 0 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@ private ReadFilterArgumentDefinitions(){}
public static final String FILTER_TOO_SHORT_NAME = "filter-too-short";
public static final String DONT_REQUIRE_SOFT_CLIPS_BOTH_ENDS_NAME = "dont-require-soft-clips-both-ends";

public static final String MAXIMUM_END_CLIPPED_BASES = "max-clipped-bases";

public static final String PL_FILTER_NAME_LONG_NAME = "platform-filter-name";

public static final String BLACK_LISTED_LANES_LONG_NAME = "black-listed-lanes";
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
package org.broadinstitute.hellbender.engine.filters;

import com.google.common.annotations.VisibleForTesting;
import htsjdk.samtools.CigarElement;
import htsjdk.samtools.CigarOperator;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.cmdline.ReadFilterArgumentDefinitions;
import org.broadinstitute.hellbender.utils.help.HelpConstants;
import org.broadinstitute.hellbender.utils.read.GATKRead;

import java.util.Collections;
import java.util.List;

/**
* Filter out reads where the number of soft-/hard-clipped bases on either end is above a certain threshold.
*
* This filter will filter reads where the number of soft clips on one end of the read is too large. It does NOT
* check the total number of bases clipped on BOTH ends. For the purposes of this filter, the count of soft and hard
* clipped bases on either end of a read are combined.
*
* For example, a read with the following cigar strings will be filtered out (given maxClippedBases=1000):
*
* 1500S0000M
* 900H300S50000M
* 1001H50000M
*
* However, this read will NOT be filtered out (given maxClippedBases=1000):
*
* 900S50000M900S
* 150H800S50000M900S150H
* 1000S500M
* 500M1000S
*
* Note: This is designed for use with (unpaired) long reads, but can be used with any reads.
*/
@DocumentedFeature(
groupName= HelpConstants.DOC_CAT_READFILTERS,
groupSummary=HelpConstants.DOC_CAT_READFILTERS_SUMMARY,
summary = "Filter out reads that have too many clipped bases on either end."
)
public final class ExcessiveEndClippedReadFilter extends ReadFilter {
static final long serialVersionUID = 1L;

@Argument(fullName = ReadFilterArgumentDefinitions.MAXIMUM_END_CLIPPED_BASES,
doc = "Maximum number of clipped bases on either end of a given read",
optional = true)
private int maxClippedBases = 1000;

// Command line parser requires a no-arg constructor
public ExcessiveEndClippedReadFilter() {}

@VisibleForTesting
ExcessiveEndClippedReadFilter(final int maxClippedBases) {
this.maxClippedBases = maxClippedBases;
}

@Override
public boolean test(final GATKRead read) {
final List<CigarElement> cigarElements = read.getCigarElements();

// Check front end of the read:
int clippedBases = 0;
for ( final CigarElement element : cigarElements) {
if ( element.getOperator() != CigarOperator.S && element.getOperator() != CigarOperator.H) {
break;
}
clippedBases += element.getLength();
}
// If we fail on the front end we can quit now:
if (clippedBases > maxClippedBases) {
return false;
}

// Check back end of the read:
clippedBases = 0;
for ( int i = cigarElements.size() - 1; i >= 0; --i ) {
final CigarElement element = cigarElements.get(i);
if ( element.getOperator() != CigarOperator.S && element.getOperator() != CigarOperator.H) {
break;
}
clippedBases += element.getLength();
}
return clippedBases <= maxClippedBases;
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
package org.broadinstitute.hellbender.engine.filters;

import htsjdk.samtools.Cigar;
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.TextCigarCodec;
import org.broadinstitute.hellbender.GATKBaseTest;
import org.broadinstitute.hellbender.utils.read.ArtificialReadUtils;
import org.broadinstitute.hellbender.utils.read.GATKRead;
import org.testng.Assert;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;


public class ExcessiveEndClippedReadFilterUnitTest extends GATKBaseTest {

private static final int CHR_COUNT = 1;
private static final int CHR_START = 1;
private static final int CHR_SIZE = 1000;
private static final int GROUP_COUNT = 5;

private final SAMFileHeader header= ArtificialReadUtils.createArtificialSamHeaderWithGroups(CHR_COUNT, CHR_START, CHR_SIZE, GROUP_COUNT);

private GATKRead buildSAMRead(final String cigarString) {
final Cigar cigar = TextCigarCodec.decode(cigarString);
return ArtificialReadUtils.createArtificialRead(header, cigar);
}

@Test(dataProvider= "provideForTestExcessiveEndClipFilter")
public void testExcessiveEndClipFilter(final int maxClippedBases,
final String cigarString,
final boolean expectedResult) {

final ExcessiveEndClippedReadFilter filter = new ExcessiveEndClippedReadFilter(maxClippedBases);
final GATKRead read = buildSAMRead(cigarString);
Assert.assertEquals(filter.test(read), expectedResult, cigarString);
}

@DataProvider(name = "provideForTestExcessiveEndClipFilter")
public Object[][] overclippedDataProvider() {
return new Object[][] {
// Failing test cases:
{ 50, "1S10M", true },
{ 50, "10M1S", true },
{ 50, "49S10M", true },
{ 50, "50S10M", true },
{ 50, "10M49S", true },
{ 50, "10M50S", true },
{ 50, "49S10M1S", true },
{ 50, "50S10M1S", true },
{ 50, "1S10M49S", true },
{ 50, "1S10M50S", true },
{ 50, "49H10M1S", true },
{ 50, "50H10M1S", true },
{ 50, "1H10M49S", true },
{ 50, "1H10M50S", true },
{ 50, "49H10M49H", true },
{ 50, "50H10M50H", true },
{ 50, "40H9S10M", true },
{ 50, "40H10S10M", true },
{ 50, "10M40H9S", true },
{ 50, "10M40H10S", true },

// Passing test cases:
{ 50, "10M51S", false },
{ 50, "51S10M", false },
{ 50, "51S10M51S", false },

{ 50, "10M51H", false },
{ 50, "51H10M", false },
{ 50, "51H10M51H", false },

{ 50, "10M26S26H", false },
{ 50, "26H26S10M", false },
{ 50, "26H26S10M26S26H", false },
};
}
}

0 comments on commit caa48f9

Please sign in to comment.