libStatGen Software  1
Cigar.h
1 /*
2  * Copyright (C) 2010-2011 Regents of the University of Michigan
3  *
4  * This program is free software: you can redistribute it and/or modify
5  * it under the terms of the GNU General Public License as published by
6  * the Free Software Foundation, either version 3 of the License, or
7  * (at your option) any later version.
8  *
9  * This program is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program. If not, see <http://www.gnu.org/licenses/>.
16  */
17 
18 #if !defined(_CIGAR_H)
19 #define _CIGAR_H
20 
21 #include <string.h> // for inline use of strcat, etc
22 #include <limits.h> // for INT_MAX
23 #include <stdint.h> // for uint32_t and friends
24 
25 
26 #include <assert.h>
27 #include <stdio.h>
28 #include <stdlib.h>
29 #include <string>
30 #include <iostream>
31 #include <iomanip>
32 #include <utility>
33 #include <vector>
34 
35 #include "Generic.h"
36 #include "StringBasics.h"
37 
38 /// This class represents the CIGAR without any methods to set the cigar
39 /// (see CigarRoller for that).
40 
41 //
42 // Docs from Sam1.pdf:
43 //
44 // Clipped alignment. In Smith-Waterman alignment, a sequence may not be aligned from the first residue to the last one.
45 // Subsequences at the ends may be clipped off. We introduce operation ʻSʼ to describe (softly) clipped alignment. Here is
46 // an example. Suppose the clipped alignment is:
47 // REF: AGCTAGCATCGTGTCGCCCGTCTAGCATACGCATGATCGACTGTCAGCTAGTCAGACTAGTCGATCGATGTG
48 // READ: gggGTGTAACC-GACTAGgggg
49 // where on the read sequence, bases in uppercase are matches and bases in lowercase are clipped off. The CIGAR for
50 // this alignment is: 3S8M1D6M4S.
51 //
52 //
53 // If the mapping position of the query is not available, RNAME and
54 // CIGAR are set as “*”
55 //
56 // A CIGAR string is comprised of a series of operation lengths plus the operations. The conventional CIGAR format allows
57 // for three types of operations: M for match or mismatch, I for insertion and D for deletion. The extended CIGAR format
58 // further allows four more operations, as is shown in the following table, to describe clipping, padding and splicing:
59 //
60 // op Description
61 // -- -----------
62 // M Match or mismatch
63 // I Insertion to the reference
64 // D Deletion from the reference
65 // N Skipped region from the reference
66 // S Soft clip on the read (clipped sequence present in <seq>)
67 // H Hard clip on the read (clipped sequence NOT present in <seq>)
68 // P Padding (silent deletion from the padded reference sequence)
69 //
70 
71 
72 
73 
74 ////////////////////////////////////////////////////////////////////////
75 ///
76 /// This class represents the CIGAR. It contains methods for converting
77 /// to strings and extracting information from the cigar on how a read
78 /// maps to the reference.
79 ///
80 /// It only contains read only methods. There are no ways to set
81 /// values. To set a value, a child class must be used.
82 ///
83 class Cigar
84 {
85 public:
86  /// Enum for the cigar operations.
87  enum Operation {
88  none=0, ///< no operation has been set.
89  match, ///< match/mismatch operation. Associated with CIGAR Operation "M"
90  mismatch, ///< mismatch operation. Associated with CIGAR Operation "M"
91  insert, ///< insertion to the reference (the query sequence contains bases that have no corresponding base in the reference). Associated with CIGAR Operation "I"
92  del, ///< deletion from the reference (the reference contains bases that have no corresponding base in the query sequence). Associated with CIGAR Operation "D"
93  skip, ///< skipped region from the reference (the reference contains bases that have no corresponding base in the query sequence). Associated with CIGAR Operation "N"
94  softClip, ///< Soft clip on the read (clipped sequence present in the query sequence, but not in reference). Associated with CIGAR Operation "S"
95  hardClip, ///< Hard clip on the read (clipped sequence not present in the query sequence or reference). Associated with CIGAR Operation "H"
96  pad ///< Padding (not in reference or query). Associated with CIGAR Operation "P"
97  };
98 
99  // The maximum value in the operation enum (used for setting up a bitset of
100  // operations.
101  static const int MAX_OP_VALUE = pad;
102 
103  ////////////////////////////////////////////////////////////////////////
104  //
105  // Nested Struct : CigarOperator
106  //
108  {
109 
110  CigarOperator()
111  {
112  operation = none;
113  count = 0;
114  }
115 
116  /// Set the cigar operator with the specified operation and
117  /// count length.
118  CigarOperator(Operation operation, uint32_t count)
119  : operation(operation), count(count) {};
120 
121  Operation operation;
122 
123  uint32_t count;
124 
125  /// Get the character code (M, I, D, N, S, H, or P) associated with
126  /// this operation.
127  char getChar() const
128  {
129  switch (operation)
130  {
131  case none:
132  return '?'; // error
133  case match:
134  case mismatch:
135  return'M';
136  case insert:
137  return 'I';
138  case del:
139  return'D';
140  case skip:
141  return 'N';
142  case softClip:
143  return 'S';
144  case hardClip:
145  return 'H';
146  case pad:
147  return 'P';
148  }
149  return '?'; // actually it is an error to get here
150  }
151 
152  /// Compare only on the operator, true if they are the same, false if not. Match and mismatch are considered the same for CIGAR strings.
153  bool operator == (const CigarOperator &rhs) const
154  {
155  if (operation==rhs.operation)
156  return true;
157  if ((operation == mismatch || operation == match) && (rhs.operation == mismatch || rhs.operation == match))
158  return true;
159  return false;
160  }
161 
162  /// Compare only on the operator, false if they are the same, true if not. Match and mismatch are considered the same for CIGAR strings.
163  bool operator != (const CigarOperator &rhs) const
164  {
165  return !((*this) == rhs) ;
166  }
167 
168  };
169 
170  ////////////////////////////////////////////////////////////////////////
171  //
172  // Cigar Class statics
173  //
174 
175  /// Return true if the specified operation is found in the
176  /// reference sequence, false if not.
177  static bool foundInReference(Operation op)
178  {
179  switch(op)
180  {
181  case match:
182  case mismatch:
183  case del:
184  case skip:
185  return true;
186  default:
187  return false;
188  }
189  return false;
190  }
191 
192  /// Return true if the specified operation is found in the
193  /// reference sequence, false if not.
194  static bool foundInReference(char op)
195  {
196  switch(op)
197  {
198  case 'M':
199  case '=':
200  case 'X':
201  case 'D':
202  case 'N':
203  return true;
204  default:
205  return false;
206  }
207  return false;
208  }
209 
210  /// Return true if the specified operation is found in the
211  /// reference sequence, false if not.
212  static bool foundInReference(const CigarOperator &op)
213  {
214  return(foundInReference(op.operation));
215  }
216 
217  /// Return true if the specified operation is found in the
218  /// query sequence, false if not.
219  static bool foundInQuery(Operation op)
220  {
221  switch(op)
222  {
223  case match:
224  case mismatch:
225  case insert:
226  case softClip:
227  return true;
228  default:
229  return false;
230  }
231  return false;
232  }
233 
234  /// Return true if the specified operation is found in the
235  /// query sequence, false if not.
236  static bool foundInQuery(char op)
237  {
238  switch(op)
239  {
240  case 'M':
241  case '=':
242  case 'X':
243  case 'I':
244  case 'S':
245  return true;
246  default:
247  return false;
248  }
249  return false;
250  }
251 
252  /// Return true if the specified operation is found in the
253  /// query sequence, false if not.
254  static bool foundInQuery(const CigarOperator &op)
255  {
256  return(foundInQuery(op.operation));
257  }
258 
259  /// Return true if the specified operation is a clipping operation,
260  /// false if not.
261  static bool isClip(Operation op)
262  {
263  switch(op)
264  {
265  case softClip:
266  case hardClip:
267  return true;
268  default:
269  return false;
270  }
271  return false;
272  }
273 
274  /// Return true if the specified operation is a clipping operation,
275  /// false if not.
276  static bool isClip(char op)
277  {
278  switch(op)
279  {
280  case 'S':
281  case 'H':
282  return true;
283  default:
284  return false;
285  }
286  return false;
287  }
288 
289  /// Return true if the specified operation is a clipping operation,
290  /// false if not.
291  static bool isClip(const CigarOperator &op)
292  {
293  return(isClip(op.operation));
294  }
295 
296  /// Return true if the specified operation is a match/mismatch operation,
297  /// false if not.
298  static bool isMatchOrMismatch(Operation op)
299  {
300  switch(op)
301  {
302  case match:
303  case mismatch:
304  return true;
305  default:
306  return false;
307  }
308  return false;
309  }
310 
311  /// Return true if the specified operation is a match/mismatch operation,
312  /// false if not.
313  static bool isMatchOrMismatch(const CigarOperator &op)
314  {
315  return(isMatchOrMismatch(op.operation));
316  }
317 
318 
319  ////////////////////////////////////////////////////////////////////////
320  //
321  // Cigar Class non static
322  //
323  friend std::ostream &operator << (std::ostream &stream, const Cigar& cigar);
324 
325  /// Default constructor initializes as a CIGAR with no operations.
327  {
328  clearQueryAndReferenceIndexes();
329  }
330 
331 
332  /// Set the passed in String to the string reprentation of the Cigar
333  /// operations in this object.
334  void getCigarString(String& cigarString) const;
335 
336  /// Set the passed in std::string to the string reprentation of the Cigar
337  /// operations in this object.
338  void getCigarString(std::string& cigarString) const;
339 
340  /// Sets the specified string to a valid CIGAR string of characters that
341  /// represent the cigar with no digits (a CIGAR of "3M" would return "MMM").
342  /// The returned string is actually also a valid CIGAR string.
343  /// In theory this makes it easier to parse some reads.
344  /// \return s the string to populate
345  void getExpandedString(std::string &s) const;
346 
347  /// Return the Cigar Operation at the specified index (starting at 0).
348  const CigarOperator & operator [](int i) const
349  {
350  return cigarOperations[i];
351  }
352 
353  /// Return the Cigar Operation at the specified index (starting at 0).
354  const CigarOperator & getOperator(int i) const
355  {
356  return cigarOperations[i];
357  }
358 
359  /// Return true if the 2 Cigars are the same
360  /// (the same operations of the same sizes).
361  bool operator == (Cigar &rhs) const;
362 
363  /// Return the number of cigar operations
364  int size() const
365  {
366  return cigarOperations.size();
367  }
368 
369  /// Write this object as a string to cout.
370  void Dump() const
371  {
372  String cigarString;
373  getCigarString(cigarString);
374  std::cout << cigarString ;
375  }
376 
377  /// Return the length of the read that corresponds to
378  /// the current CIGAR string.
379  ///
380  /// For validation, we should expect that a sequence
381  /// read in a SAM file will be the same length as the
382  /// value returned by this method.
383  ///
384  /// Example: 3M2D3M describes a read with three bases
385  /// matching the reference, then skips 2 bases, then has
386  /// three more bases that match the reference (match/mismatch).
387  /// In this case, the read length is expected to be 6.
388  ///
389  /// Example: 3M2I3M describes a read with 3 match/mismatch
390  /// bases, two extra bases, and then 3 more match/mistmatch
391  /// bases. The total in this example is 8 bases.
392  ///
393  /// \return returns the expected read length
394  int getExpectedQueryBaseCount() const;
395 
396  /// Return the number of bases in the reference that
397  /// this CIGAR "spans".
398  ///
399  /// When doing range checking, we occassionally need to know
400  /// how many total bases the CIGAR string represents as compared
401  /// to the reference.
402  ///
403  /// Examples: 3M2D3M describes a read that overlays 8 bases in
404  /// the reference. 3M2I3M describes a read with 3 bases that
405  /// match the reference, two additional bases that aren't in the
406  /// reference, and 3 more bases that match the reference, so it
407  /// spans 6 bases in the reference.
408  ///
409  /// \return how many bases in the reference are spanned
410  /// by the given CIGAR string
411  ///
412  int getExpectedReferenceBaseCount() const;
413 
414  /// Return the number of clips that are at the beginning of the cigar.
415  int getNumBeginClips() const;
416 
417  /// Return the number of clips that are at the end of the cigar.
418  int getNumEndClips() const;
419 
420  /// Return the reference offset associated with the specified
421  /// query index or INDEX_NA based on this cigar.
422  int32_t getRefOffset(int32_t queryIndex);
423 
424  /// Return the query index associated with the specified
425  /// reference offset or INDEX_NA based on this cigar.
426  int32_t getQueryIndex(int32_t refOffset);
427 
428  /// Return the reference position associated with the specified query index
429  /// or INDEX_NA based on this cigar and the specified queryStartPos which
430  /// is the leftmost mapping position of the first matching base in the
431  /// query.
432  int32_t getRefPosition(int32_t queryIndex, int32_t queryStartPos);
433 
434  /// Return the query index or INDEX_NA associated with the specified
435  /// reference offset when the query starts at the specified reference
436  /// position.
437  int32_t getQueryIndex(int32_t refPosition, int32_t queryStartPos);
438 
439  /// Returns the index into the expanded cigar for the cigar
440  /// associated with the specified queryIndex.
441  /// INDEX_NA returned if the index is out of range.
442  int32_t getExpandedCigarIndexFromQueryIndex(int32_t queryIndex);
443 
444  /// Returns the index into the expanded cigar for the cigar
445  /// associated with the specified reference offset.
446  /// INDEX_NA returned if the offset is out of range.
447  int32_t getExpandedCigarIndexFromRefOffset(int32_t refOffset);
448 
449  /// Returns the index into the expanded cigar for the cigar
450  /// associated with the specified reference position and queryStartPos.
451  /// INDEX_NA returned if the position is out of range.
452  int32_t getExpandedCigarIndexFromRefPos(int32_t refPosition,
453  int32_t queryStartPos);
454 
455  /// Return the character code of the cigar operator associated with the
456  /// specified expanded CIGAR index. '?' is returned for an out of range
457  /// index.
458  char getCigarCharOp(int32_t expandedCigarIndex);
459 
460  /// Return the character code of the cigar operator associated with
461  /// the specified queryIndex. '?' is returned for an out of range index.
462  char getCigarCharOpFromQueryIndex(int32_t queryIndex);
463 
464  /// Return the character code of the cigar operator associated with
465  /// the specified reference offset. '?' is returned for an out of range offset.
466  char getCigarCharOpFromRefOffset(int32_t refOffset);
467 
468  /// Return the character code of the cigar operator associated with
469  /// the specified reference position. '?' is returned for an out of
470  /// range reference position.
471  char getCigarCharOpFromRefPos(int32_t refPosition, int32_t queryStartPos);
472 
473  /// Return the number of bases that overlap the reference and the
474  /// read associated with this cigar that falls within the specified region.
475  /// \param start : inclusive 0-based start position (reference position) of
476  /// the region to check for overlaps in
477  /// (-1 indicates to start at the beginning of the reference.)
478  /// \param end : exclusive 0-based end position (reference position) of the
479  /// region to check for overlaps in
480  /// (-1 indicates to go to the end of the reference.)
481  /// \param queryStartPos : 0-based leftmost mapping position of the first
482  /// matcihng base in the query.
483  uint32_t getNumOverlaps(int32_t start, int32_t end, int32_t queryStartPos);
484 
485  /// Return whether or not the cigar has indels (insertions or delections)
486  /// \return true if it has an insertion or deletion, false if not.
487  bool hasIndel();
488 
489  /// Value associated with an index that is not applicable/does not exist,
490  /// used for converting between query and reference indexes/offsets when
491  /// an associated index/offset does not exist.
492  static const int32_t INDEX_NA;
493 
494 protected:
495  // Clear the query index/reference offset index vectors.
496  void clearQueryAndReferenceIndexes();
497 
498  // Set the query index/reference offset index vectors.
499  void setQueryAndReferenceIndexes();
500 
501  // Container for the cigar operations in this cigar.
502  std::vector<CigarOperator> cigarOperations;
503 
504 private:
505  // The vector is indexed by query index and contains the reference
506  // offset associated with that query index.
507  // The vector is reset each time a new cigar operation is added, and
508  // is calculated when accessed if it is not already set.
509  std::vector<int32_t> queryToRef;
510 
511  // The vector is indexed by reference offset and contains the query
512  // index associated with that reference offset.
513  // The vector is reset each time a new cigar operation is added, and
514  // is calculated when accessed if it is not already set.
515  std::vector<int32_t> refToQuery;
516 
517  // The vector is indexed by reference offset and contains the offset into
518  // the expanded cigar associated with that reference offset.
519  // The vector is reset each time a new cigar operation is added, and
520  // is calculated when accessed if it is not already set.
521  std::vector<int32_t> refToCigar;
522 
523  // The vector is indexed by query index and contains the offset into
524  // the expanded cigar associated with that query index.
525  // The vector is reset each time a new cigar operation is added, and
526  // is calculated when accessed if it is not already set.
527  std::vector<int32_t> queryToCigar;
528 
529  std::string myExpandedCigar;
530 };
531 
532 /// Writes the specified cigar operation to the specified stream as <count><char> (3M).
533 inline std::ostream &operator << (std::ostream &stream, const Cigar::CigarOperator& o)
534 {
535  stream << o.count << o.getChar();
536  return stream;
537 }
538 
539 /// Writes all of the cigar operations contained in the cigar to the passed in stream.
540 inline std::ostream &operator << (std::ostream &stream, const Cigar& cigar)
541 {
542  stream << cigar.cigarOperations;
543  return stream;
544 }
545 
546 #endif
InputFile & operator<<(InputFile &stream, const std::string &str)
Write to a file using streaming.
Definition: InputFile.h:736
This class represents the CIGAR without any methods to set the cigar (see CigarRoller for that).
Definition: Cigar.h:84
int size() const
Return the number of cigar operations.
Definition: Cigar.h:364
int getExpectedQueryBaseCount() const
Return the length of the read that corresponds to the current CIGAR string.
Definition: Cigar.cpp:95
static const int32_t INDEX_NA
Value associated with an index that is not applicable/does not exist, used for converting between que...
Definition: Cigar.h:492
char getCigarCharOp(int32_t expandedCigarIndex)
Return the character code of the cigar operator associated with the specified expanded CIGAR index.
Definition: Cigar.cpp:295
static bool foundInReference(Operation op)
Return true if the specified operation is found in the reference sequence, false if not.
Definition: Cigar.h:177
char getCigarCharOpFromQueryIndex(int32_t queryIndex)
Return the character code of the cigar operator associated with the specified queryIndex.
Definition: Cigar.cpp:314
friend std::ostream & operator<<(std::ostream &stream, const Cigar &cigar)
Writes all of the cigar operations contained in the cigar to the passed in stream.
Definition: Cigar.h:540
static bool foundInQuery(const CigarOperator &op)
Return true if the specified operation is found in the query sequence, false if not.
Definition: Cigar.h:254
Cigar()
Default constructor initializes as a CIGAR with no operations.
Definition: Cigar.h:326
int32_t getExpandedCigarIndexFromQueryIndex(int32_t queryIndex)
Returns the index into the expanded cigar for the cigar associated with the specified queryIndex.
Definition: Cigar.cpp:258
static bool foundInQuery(char op)
Return true if the specified operation is found in the query sequence, false if not.
Definition: Cigar.h:236
static bool isMatchOrMismatch(Operation op)
Return true if the specified operation is a match/mismatch operation, false if not.
Definition: Cigar.h:298
static bool foundInQuery(Operation op)
Return true if the specified operation is found in the query sequence, false if not.
Definition: Cigar.h:219
static bool foundInReference(const CigarOperator &op)
Return true if the specified operation is found in the reference sequence, false if not.
Definition: Cigar.h:212
static bool isMatchOrMismatch(const CigarOperator &op)
Return true if the specified operation is a match/mismatch operation, false if not.
Definition: Cigar.h:313
int32_t getExpandedCigarIndexFromRefPos(int32_t refPosition, int32_t queryStartPos)
Returns the index into the expanded cigar for the cigar associated with the specified reference posit...
Definition: Cigar.cpp:288
void getExpandedString(std::string &s) const
Sets the specified string to a valid CIGAR string of characters that represent the cigar with no digi...
Definition: Cigar.cpp:63
int getNumBeginClips() const
Return the number of clips that are at the beginning of the cigar.
Definition: Cigar.cpp:144
bool hasIndel()
Return whether or not the cigar has indels (insertions or delections)
Definition: Cigar.cpp:398
int getNumEndClips() const
Return the number of clips that are at the end of the cigar.
Definition: Cigar.cpp:166
char getCigarCharOpFromRefOffset(int32_t refOffset)
Return the character code of the cigar operator associated with the specified reference offset.
Definition: Cigar.cpp:320
int getExpectedReferenceBaseCount() const
Return the number of bases in the reference that this CIGAR "spans".
Definition: Cigar.cpp:120
int32_t getRefPosition(int32_t queryIndex, int32_t queryStartPos)
Return the reference position associated with the specified query index or INDEX_NA based on this cig...
Definition: Cigar.cpp:217
int32_t getExpandedCigarIndexFromRefOffset(int32_t refOffset)
Returns the index into the expanded cigar for the cigar associated with the specified reference offse...
Definition: Cigar.cpp:273
const CigarOperator & operator[](int i) const
Return the Cigar Operation at the specified index (starting at 0).
Definition: Cigar.h:348
static bool isClip(char op)
Return true if the specified operation is a clipping operation, false if not.
Definition: Cigar.h:276
Operation
Enum for the cigar operations.
Definition: Cigar.h:87
@ del
deletion from the reference (the reference contains bases that have no corresponding base in the quer...
Definition: Cigar.h:92
@ mismatch
mismatch operation. Associated with CIGAR Operation "M"
Definition: Cigar.h:90
@ hardClip
Hard clip on the read (clipped sequence not present in the query sequence or reference)....
Definition: Cigar.h:95
@ match
match/mismatch operation. Associated with CIGAR Operation "M"
Definition: Cigar.h:89
@ pad
Padding (not in reference or query). Associated with CIGAR Operation "P".
Definition: Cigar.h:96
@ insert
insertion to the reference (the query sequence contains bases that have no corresponding base in the ...
Definition: Cigar.h:91
@ skip
skipped region from the reference (the reference contains bases that have no corresponding base in th...
Definition: Cigar.h:93
@ softClip
Soft clip on the read (clipped sequence present in the query sequence, but not in reference)....
Definition: Cigar.h:94
@ none
no operation has been set.
Definition: Cigar.h:88
int32_t getRefOffset(int32_t queryIndex)
Return the reference offset associated with the specified query index or INDEX_NA based on this cigar...
Definition: Cigar.cpp:187
void Dump() const
Write this object as a string to cout.
Definition: Cigar.h:370
static bool isClip(const CigarOperator &op)
Return true if the specified operation is a clipping operation, false if not.
Definition: Cigar.h:291
static bool foundInReference(char op)
Return true if the specified operation is found in the reference sequence, false if not.
Definition: Cigar.h:194
bool operator==(Cigar &rhs) const
Return true if the 2 Cigars are the same (the same operations of the same sizes).
Definition: Cigar.cpp:80
uint32_t getNumOverlaps(int32_t start, int32_t end, int32_t queryStartPos)
Return the number of bases that overlap the reference and the read associated with this cigar that fa...
Definition: Cigar.cpp:334
char getCigarCharOpFromRefPos(int32_t refPosition, int32_t queryStartPos)
Return the character code of the cigar operator associated with the specified reference position.
Definition: Cigar.cpp:326
void getCigarString(String &cigarString) const
Set the passed in String to the string reprentation of the Cigar operations in this object.
Definition: Cigar.cpp:52
int32_t getQueryIndex(int32_t refOffset)
Return the query index associated with the specified reference offset or INDEX_NA based on this cigar...
Definition: Cigar.cpp:202
const CigarOperator & getOperator(int i) const
Return the Cigar Operation at the specified index (starting at 0).
Definition: Cigar.h:354
static bool isClip(Operation op)
Return true if the specified operation is a clipping operation, false if not.
Definition: Cigar.h:261
CigarOperator(Operation operation, uint32_t count)
Set the cigar operator with the specified operation and count length.
Definition: Cigar.h:118
bool operator!=(const CigarOperator &rhs) const
Compare only on the operator, false if they are the same, true if not. Match and mismatch are conside...
Definition: Cigar.h:163
bool operator==(const CigarOperator &rhs) const
Compare only on the operator, true if they are the same, false if not. Match and mismatch are conside...
Definition: Cigar.h:153
char getChar() const
Get the character code (M, I, D, N, S, H, or P) associated with this operation.
Definition: Cigar.h:127