libStatGen Software  1
SamTags Class Reference

Class for parsing/creating/operating on SAM/BAM record tags. More...

#include <SamTags.h>

Constants for parsing tags.

static const char * BQ_TAG = "BQ"
 
static const char BQ_TAG_TYPE = 'Z'
 
static const char * MD_TAG = "MD"
 
static const char MD_TAG_TYPE = 'Z'
 
static const char * ORIG_POS_TAG = "OP"
 
static const char ORIG_POS_TAG_TYPE = 'i'
 
static const char * ORIG_CIGAR_TAG = "OC"
 
static const char ORIG_CIGAR_TAG_TYPE = 'Z'
 
static const char * ORIG_QUAL_TAG = "OQ"
 
static const char ORIG_QUAL_TAG_TYPE = 'Z'
 
static bool createMDTag (String &outputMDtag, SamRecord &inputRec, GenomeSequence &genome)
 Create the MD tag for the specified input record and the genome. More...
 
static bool isMDTagCorrect (SamRecord &inputRec, GenomeSequence &genome)
 Check to see if the MD tag in the record is accurate.
 
static bool updateMDTag (SamRecord &inputRec, GenomeSequence &genome)
 

Detailed Description

Class for parsing/creating/operating on SAM/BAM record tags.

Definition at line 26 of file SamTags.h.

Member Function Documentation

◆ createMDTag()

bool SamTags::createMDTag ( String outputMDtag,
SamRecord inputRec,
GenomeSequence genome 
)
static

Create the MD tag for the specified input record and the genome.

Returns
returns true if an MD tag was created, false if one could not be created.

Definition at line 34 of file SamTags.cpp.

36 {
37  outputMDtag.Clear();
38  // Get the cigar to use for determing alignment.
39  Cigar* cigarInfo = inputRec.getCigarInfo();
40  if(cigarInfo == NULL)
41  {
42  throw(std::runtime_error("Cannot createMDTag - failed to get the cigar"));
43  return(false);
44  }
45  int32_t queryIndex = Cigar::INDEX_NA;
46 
47  // get where this read starts on the reference.
48  uint32_t startOfReadOnRefIndex =
49  genome.getGenomePosition(inputRec.getReferenceName());
50  if(startOfReadOnRefIndex == (uint32_t)INVALID_CHROMOSOME_INDEX)
51  {
52  // Failed to find the reference for this chromosome, so return false.
53  return(false);
54  }
55  startOfReadOnRefIndex += inputRec.get0BasedPosition();
56 
57  // Track the number of consecutive matches.
58  int32_t matchCount = 0;
59  // Track if it is currently in a deletion so it knows when not to add
60  // a '^'.
61  bool currentDeletion = false;
62 
63  // Loop through the Reference indices (ignores insertions/pads/clips).
64  for(int refOffset = 0;
65  refOffset < cigarInfo->getExpectedReferenceBaseCount();
66  ++refOffset)
67  {
68  // Get the query index for this reference position..
69  queryIndex = cigarInfo->getQueryIndex(refOffset);
70 
71  char refBase = genome[startOfReadOnRefIndex + refOffset];
72 
73  if(queryIndex != Cigar::INDEX_NA)
74  {
75  // Both the reference and the read have a base, so get the bases.
76  char readBase = inputRec.getSequence(queryIndex);
77  currentDeletion = false;
78 
79  // If neither base is unknown and they are the same, count it
80  // as a match.
81  if(!BaseUtilities::isAmbiguous(readBase) &&
82  !BaseUtilities::isAmbiguous(refBase) &&
83  (BaseUtilities::areEqual(readBase, refBase)))
84  {
85  // Match, so update counter.
86  ++matchCount;
87  }
88  else
89  {
90  // Mismatch, so output the number of matches if any.
91  if(matchCount != 0)
92  {
93  outputMDtag += matchCount;
94  matchCount = 0;
95  }
96  outputMDtag += refBase;
97  }
98  }
99  else
100  {
101  // This reference position is not in the query, so it is a deletion.
102  // Deletion, so output the number of matches if any.
103  if(matchCount != 0)
104  {
105  outputMDtag += matchCount;
106  matchCount = 0;
107  }
108 
109  if(!currentDeletion)
110  {
111  // Not currently in a deletion, so add the ^
112  outputMDtag += '^';
113  }
114  // Add the deleted base.
115  outputMDtag += refBase;
116  currentDeletion = true;
117  }
118  }
119 
120  // output the match count at the end.
121  outputMDtag += matchCount;
122  return(true);
123 }

References BaseUtilities::areEqual(), SamRecord::get0BasedPosition(), SamRecord::getCigarInfo(), Cigar::getExpectedReferenceBaseCount(), GenomeSequence::getGenomePosition(), Cigar::getQueryIndex(), SamRecord::getReferenceName(), SamRecord::getSequence(), Cigar::INDEX_NA, and BaseUtilities::isAmbiguous().

Referenced by isMDTagCorrect(), and SamValidator::isValidTags().


The documentation for this class was generated from the following files:
Cigar
This class represents the CIGAR without any methods to set the cigar (see CigarRoller for that).
Definition: Cigar.h:83
SamRecord::get0BasedPosition
int32_t get0BasedPosition()
Get the 0-based(BAM) leftmost position of the record.
Definition: SamRecord.cpp:1307
SamRecord::getReferenceName
const char * getReferenceName()
Get the reference sequence name (RNAME) of the record.
Definition: SamRecord.cpp:1286
BaseUtilities::isAmbiguous
static bool isAmbiguous(char base)
Returns whether or not the specified bases is an indicator for ambiguity.
Definition: BaseUtilities.cpp:23
Cigar::INDEX_NA
static const int32_t INDEX_NA
Value associated with an index that is not applicable/does not exist, used for converting between que...
Definition: Cigar.h:492
Cigar::getExpectedReferenceBaseCount
int getExpectedReferenceBaseCount() const
Return the number of bases in the reference that this CIGAR "spans".
Definition: Cigar.cpp:120
SamRecord::getSequence
const char * getSequence()
Returns the SAM formatted sequence string (SEQ), translating the base as specified by setSequenceTran...
Definition: SamRecord.cpp:1556
Cigar::getQueryIndex
int32_t getQueryIndex(int32_t refOffset)
Return the query index associated with the specified reference offset or INDEX_NA based on this cigar...
Definition: Cigar.cpp:202
BaseUtilities::areEqual
static bool areEqual(char base1, char base2)
Returns whether or not two bases are equal (case insensitive), if one of the bases is '=',...
Definition: BaseUtilities.cpp:39
SamRecord::getCigarInfo
Cigar * getCigarInfo()
Returns a pointer to the Cigar object associated with this record.
Definition: SamRecord.cpp:1824
GenomeSequence::getGenomePosition
genomeIndex_t getGenomePosition(const char *chromosomeName, unsigned int chromosomeIndex) const
given a chromosome name and position, return the genome position
Definition: GenomeSequence.cpp:779