libStatGen Software  1
SamRecord.h
1 /*
2  * Copyright (C) 2010-2011 Regents of the University of Michigan
3  *
4  * This program is free software: you can redistribute it and/or modify
5  * it under the terms of the GNU General Public License as published by
6  * the Free Software Foundation, either version 3 of the License, or
7  * (at your option) any later version.
8  *
9  * This program is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program. If not, see <http://www.gnu.org/licenses/>.
16  */
17 
18 #ifndef __SAM_RECORD_H__
19 #define __SAM_RECORD_H__
20 
21 #include <stdint.h>
22 
23 #include "GenomeSequence.h"
24 #include "SamStatus.h"
25 #include "LongHash.h"
26 #include "MathVector.h"
27 #include "StringArray.h"
28 #include "IntArray.h"
29 #include "SamFileHeader.h"
30 #include "CigarRoller.h"
31 
/// Structure of a BAM record.
/// The members are declared in the order: block size, reference id,
/// position, a 32-bit word of bit-fields (read name length, map quality,
/// bin), a second 32-bit word of bit-fields (cigar length, flag), read
/// length, mate reference id, mate position, insert size, and finally
/// myData.
struct bamRecordStruct
{
public:
    int32_t myBlockSize;
    int32_t myReferenceID;
    int32_t myPosition;
    // 8 + 8 + 16 bits declared together as bit-fields of one uint32_t.
    uint32_t myReadNameLength : 8, myMapQuality : 8, myBin : 16;
    // 16 + 16 bits declared together as bit-fields of one uint32_t.
    uint32_t myCigarLength : 16, myFlag : 16;
    int32_t myReadLength;
    int32_t myMateReferenceID;
    int32_t myMatePosition;
    int32_t myInsertSize; // Outer fragment length
    // NOTE(review): presumably a placeholder marking where the
    // variable-length fields begin - confirm against SamRecord.cpp.
    char myData[1];
};
47 
48 
49 /// Class providing an easy to use interface to get/set/operate on the
50 /// fields in a SAM/BAM record.
51 class SamRecord
52 {
53 public:
/// Enum containing the settings on how to translate the sequence if a
/// reference is available. If no reference is available, no translation
/// is done.
enum SequenceTranslation {
    NONE,   ///< Leave the sequence as is.
    EQUAL,  ///< Translate bases that match the reference to '='
    BASES,  ///< Translate '=' to the actual base.
};
62 
63  /// Default Constructor.
64  SamRecord();
65 
66  /// Constructor that sets the error handling type.
67  /// \param errorHandlingType how to handle errors.
68  SamRecord(ErrorHandler::HandlingType errorHandlingType);
69 
70  /// Destructor
71  ~SamRecord();
72 
73  /// Reset the fields of the record to a default value.
74  /// This is not necessary when you are reading a SAM/BAM file,
75  /// but if you are setting fields, it is a good idea to clean
76  /// out a record before reusing it. Clearing it allows you to
77  /// not have to set any empty fields.
78  void resetRecord();
79 
80  /// Returns whether or not the record is valid, setting the status to
81  /// indicate success or failure.
82  /// \param header SAM Header associated with the record. Used to perform
83  /// some validation against the header.
84  /// \return true if the record is valid, false if not.
85  bool isValid(SamFileHeader& header);
86 
87  /// Set the reference to the specified genome sequence object.
88  /// \param reference pointer to the GenomeSequence object.
89  void setReference(GenomeSequence* reference);
90 
91  /// Set the type of sequence translation to use when getting
92  /// the sequence. The default type (if this method is never called) is
93  /// NONE (the sequence is left as-is). Can be over-ridden by using
94  /// the accessors that take a SequenceTranslation parameter.
95  /// \param translation type of sequence translation to use.
97 
98  ///////////////////////
99  /// @name Set Alignment Data
100  /// Set methods for record fields. All of the "set" methods set the
101  /// status to indicate success or the failure reason.
102  //@{
103 
104  /// Set QNAME to the passed in name.
105  /// \param readName the readname to set the QNAME to.
106  /// \return true if successfully set, false if not.
107  bool setReadName(const char* readName);
108 
109  /// Set the bitwise FLAG to the specified value.
110  /// \param flag integer flag to use.
111  /// \return true if successfully set, false if not.
112  bool setFlag(uint16_t flag);
113 
114  /// Set the reference sequence name (RNAME) to the specified name, using
115  /// the header to determine the reference id.
116  /// \param header SAM/BAM header to use to determine the reference id.
117  /// \param referenceName reference name to use.
118  /// \return true if successfully set, false if not
119  bool setReferenceName(SamFileHeader& header,
120  const char* referenceName);
121 
122  /// Set the leftmost position (POS) using the specified 1-based (SAM format)
123  /// value.
124  /// Internal processing handles the switching between SAM/BAM formats
125  /// when read/written.
126  /// \param position 1-based start position
127  /// \return true if successfully set, false if not.
128  bool set1BasedPosition(int32_t position);
129 
130  /// Set the leftmost position using the specified 0-based (BAM format)
131  /// value.
132  /// Internal processing handles the switching between SAM/BAM formats
133  /// when read/written.
134  /// \param position 0-based start position
135  /// \return true if successfully set, false if not.
136  bool set0BasedPosition(int32_t position);
137 
138  /// Set the mapping quality (MAPQ).
139  /// \param mapQuality map quality to set in the record.
140  /// \return true if successfully set, false if not.
141  bool setMapQuality(uint8_t mapQuality);
142 
143  /// Set the CIGAR to the specified SAM formatted cigar string.
144  /// Internal processing handles the switching between SAM/BAM formats
145  /// when read/written.
146  /// \param cigar string containing the SAM formatted cigar.
147  /// \return true if successfully set, false if not.
148  bool setCigar(const char* cigar);
149 
150  /// Set the CIGAR to the specified Cigar object.
151  /// Internal processing handles the switching between SAM/BAM formats
152  /// when read/written.
153  /// \param cigar object to set this record's cigar to have.
154  /// \return true if successfully set, false if not.
155  bool setCigar(const Cigar& cigar);
156 
157 
158  /// Set the mate/next fragment's reference sequence name (RNEXT) to the
159  /// specified name, using the header to determine the mate reference id.
160  /// \param header SAM/BAM header to use to determine the mate reference id.
161  /// \param mateReferenceName mate reference name to use.
162  /// \return true if successfully set, false if not
163  bool setMateReferenceName(SamFileHeader& header,
164  const char* mateReferenceName);
165 
166  /// Set the mate/next fragment's leftmost position (PNEXT) using the
167  /// specified 1-based (SAM format) value.
168  /// Internal processing handles the switching between SAM/BAM formats
169  /// when read/written.
170  /// \param matePosition 1-based start position
171  /// \return true if successfully set, false if not.
172  bool set1BasedMatePosition(int32_t matePosition);
173 
174  /// Set the mate/next fragment's leftmost position using the specified
175  /// 0-based (BAM format) value.
176  /// Internal processing handles the switching between SAM/BAM formats
177  /// when read/written.
178  /// \param matePosition 0-based start position
179  /// \return true if successfully set, false if not.
180  bool set0BasedMatePosition(int32_t matePosition);
181 
182  /// Sets the inferred insert size (ISIZE)/observed template length (TLEN).
183  /// \param insertSize inferred insert size/observed template length.
184  /// \return true if successfully set, false if not.
185  bool setInsertSize(int32_t insertSize);
186 
187  /// Sets the sequence (SEQ) to the specified SAM formatted sequence string.
188  /// Internal processing handles switching between SAM/BAM formats when
189  /// read/written.
190  /// \param seq SAM sequence string. May contain '='.
191  /// \return true if successfully set, false if not.
192  bool setSequence(const char* seq);
193 
194  /// Sets the quality (QUAL) to the specified SAM formatted quality string.
195  /// Internal processing handles switching between SAM/BAM formats when
196  /// read/written.
197  /// \param quality SAM quality string.
198  /// \return true if successfully set, false if not.
199  bool setQuality(const char* quality);
200 
201  /// Shift the indels (if any) to the left by updating the CIGAR.
202  /// \return true if the cigar was shifted, false if not.
203  bool shiftIndelsLeft();
204 
205  /// Sets the SamRecord to contain the information in the BAM formatted
206  /// fromBuffer.
207  /// \param fromBuffer buffer to read the BAM record from.
208  /// \param fromBufferSize size of the buffer containing the BAM record.
209  /// \param header BAM header for the record.
210  /// \return status of reading the BAM record from the buffer.
211  SamStatus::Status setBuffer(const char* fromBuffer, uint32_t fromBufferSize,
212  SamFileHeader& header);
213 
214  /// Read the BAM record from a file.
215  /// \param filePtr file to read the buffer from.
216  /// \param header BAM header for the record.
217  /// \return status of the reading the BAM record from the file.
219 
220  //@}
221 
222  ///////////////////////
223  /// @name Set Tag Data
224  /// Set methods for tags.
225  //@{
226 
227  /// Add the specified integer tag to the record. Internal processing
228  /// handles switching between SAM/BAM formats when read/written and
229  /// determining the type for BAM format. If the tag is already there
230  /// this code will replace it if the specified value is different.
231  /// \param tag two character tag to be added to the SAM/BAM record.
232  /// \param value value for the specified tag.
233  /// \return true if the tag was successfully added, false otherwise.
234  bool addIntTag(const char* tag, int32_t value);
235 
236  /// Add the specified tag,vtype,value to the record. Vtype can be SAM/BAM
237  /// format. Internal processing handles switching between SAM/BAM formats
238  /// when read/written. If the tag is already there this code will replace
239  /// it if the specified value is different.
240  /// \param tag two character tag to be added to the SAM/BAM record.
241  /// \param vtype vtype of the specified value - either SAM/BAM vtypes.
242  /// \param value value as a string for the specified tag.
243  /// \return true if the tag was successfully added, false otherwise.
244  bool addTag(const char* tag, char vtype, const char* value);
245 
246  /// Clear the tags in this record.
247  /// Does not set SamStatus.
248  void clearTags();
249 
250  /// Remove a tag.
251  /// \param tag tag to remove.
252  /// \param type type of the tag to be removed.
253  /// \return true if the tag no longer exists in the record, false if it could not be removed (Returns true if the tag was not found in the record).
254  bool rmTag(const char* tag, char type);
255 
256  /// Remove tags.
257  /// The delimiter between the tags is ',' or ';'. ',' was added since
258  /// the original delimiter, ';', requires the string to be quoted on the
259  /// command-line.
260  /// \param tags tags to remove, formatted as Tag:Type,Tag:Type,Tag:Type...
261  /// \return true if all tags no longer exist in the record, false if any could not be removed
262  /// (Returns true if the tags were not found in the record).
263  /// SamStatus is set to INVALID if the tags are incorrectly formatted.
264  bool rmTags(const char* tags);
265 
266  //@}
267 
268  ///////////////////////
269  /// @name Get Alignment Data
270  /// Get methods for record fields. All of the "get" methods set the
271  /// status to indicate success or the failure reason.
272  //@{
273 
274  /// Get a const pointer to the buffer that contains the BAM representation
275  /// of the record.
276  /// \return const pointer to the buffer that contains the BAM representation
277  /// of the record.
278  const void* getRecordBuffer();
279 
280  /// Get a const pointer to the buffer that contains the BAM representation
281  /// of the record using the specified translation on the sequence.
282  /// \param translation type of sequence translation to use.
283  /// \return const pointer to the buffer that contains the BAM representation
284  /// of the record.
285  const void* getRecordBuffer(SequenceTranslation translation);
286 
287  /// Write the record as a BAM into the specified already opened file.
288  /// \param filePtr file to write the BAM record into.
289  /// \return status of the write.
291 
292  /// Write the record as a BAM into the specified already opened file using
293  /// the specified translation on the sequence.
294  /// \param filePtr file to write the BAM record into.
295  /// \param translation type of sequence translation to use.
296  /// \return status of the write.
298  SequenceTranslation translation);
299 
300  /// Get the block size of the record (BAM format).
301  /// \return BAM block size of the record.
302  int32_t getBlockSize();
303 
304  /// Get the reference sequence name (RNAME) of the record.
305  /// \return reference sequence name
306  const char* getReferenceName();
307 
308  /// Get the reference sequence id of the record (BAM format rid).
309  /// \return reference sequence id
310  int32_t getReferenceID();
311 
312  /// Get the 1-based(SAM) leftmost position (POS) of the record.
313  /// \return 1-based leftmost position.
314  int32_t get1BasedPosition();
315 
316  /// Get the 0-based(BAM) leftmost position of the record.
317  /// \return 0-based leftmost position.
318  int32_t get0BasedPosition();
319 
320  /// Get the length of the readname (QNAME) including the null.
321  /// \return length of the read name (including null).
322  uint8_t getReadNameLength();
323 
324  /// Get the mapping quality (MAPQ) of the record.
325  /// \return map quality.
326  uint8_t getMapQuality();
327 
328  /// Get the BAM bin for the record.
329  /// \return BAM bin
330  uint16_t getBin();
331 
332  /// Get the length of the BAM formatted CIGAR.
333  /// \return length of BAM formatted cigar.
334  uint16_t getCigarLength();
335 
336  /// Get the flag (FLAG).
337  /// \return flag.
338  uint16_t getFlag();
339 
340  /// Get the length of the read.
341  /// \return read length.
342  int32_t getReadLength();
343 
344  /// Get the mate/next fragment's reference sequence name (RNEXT). If it
345  /// is equal to the reference name, it still returns the reference name.
346  /// \return reference sequence name
347  const char* getMateReferenceName();
348 
349  /// Get the mate/next fragment's reference sequence name (RNEXT),
350  /// returning "=" if it is the same as the reference name, unless
351  /// they are both "*" in which case "*" is returned.
352  /// \return reference sequence name or '='
353  const char* getMateReferenceNameOrEqual();
354 
355  /// Get the mate reference id of the record
356  /// (BAM format: mate_rid/next_refID).
357  /// \return reference id
358  int32_t getMateReferenceID();
359 
360  /// Get the 1-based(SAM) leftmost mate/next fragment's position (PNEXT).
361  /// \return 1-based leftmost position.
362  int32_t get1BasedMatePosition();
363 
364  /// Get the 0-based(BAM) leftmost mate/next fragment's position.
365  /// \return 0-based leftmost position.
366  int32_t get0BasedMatePosition();
367 
368  /// Get the inferred insert size of the read pair (ISIZE) or
369  /// observed template length (TLEN).
370  /// \return inferred insert size or observed template length.
371  int32_t getInsertSize();
372 
373  /// Returns the 0-based inclusive rightmost position of the
374  /// clipped sequence.
375  /// \return 0-based inclusive rightmost position
376  int32_t get0BasedAlignmentEnd();
377 
378  /// Returns the 1-based inclusive rightmost position of the
379  /// clipped sequence.
380  /// \return 1-based inclusive rightmost position
381  int32_t get1BasedAlignmentEnd();
382 
383  /// Returns the length of the clipped sequence, returning 0 if the cigar
384  /// is '*'.
385  /// \return length of the clipped sequence.
386  int32_t getAlignmentLength();
387 
388  /// Returns the 0-based inclusive left-most position adjusted for
389  /// clipped bases.
390  /// \return 0-based inclusive leftmost position including clips.
391  int32_t get0BasedUnclippedStart();
392 
393  /// Returns the 1-based inclusive left-most position adjusted for
394  /// clipped bases.
395  /// \return 1-based inclusive leftmost position including clips.
396  int32_t get1BasedUnclippedStart();
397 
398  /// Returns the 0-based inclusive right-most position adjusted for
399  /// clipped bases.
400  /// \return 0-based inclusive rightmost position including clips.
401  int32_t get0BasedUnclippedEnd();
402 
403  /// Returns the 1-based inclusive right-most position adjusted for
404  /// clipped bases.
405  /// \return 1-based inclusive rightmost position including clips.
406  int32_t get1BasedUnclippedEnd();
407 
408  /// Returns the SAM formatted Read Name (QNAME).
409  /// \return read name.
410  const char* getReadName();
411 
412  /// Returns the SAM formatted CIGAR string.
413  /// \return cigar string.
414  const char* getCigar();
415 
416  /// Returns the SAM formatted sequence string (SEQ), translating the base as
417  /// specified by setSequenceTranslation.
418  /// \return sequence string.
419  const char* getSequence();
420 
421  /// Returns the SAM formatted sequence string (SEQ) performing the specified
422  /// sequence translation.
423  /// \param translation type of sequence translation to use.
424  /// \return sequence string.
425  const char* getSequence(SequenceTranslation translation);
426 
427  /// Returns the SAM formatted quality string (QUAL).
428  /// \return quality string.
429  const char* getQuality();
430 
431  /// Get the sequence base at the specified index into this sequence 0 to
432  /// readLength - 1, translating the base as specified by
433  /// setSequenceTranslation. Throws an exception if index is out of range.
434  /// \param index index into the sequence string (0 to readLength-1).
435  /// \return the sequence base at the specified index into the sequence.
436  char getSequence(int index);
437 
438  /// Get the sequence base at the specified index into this sequence 0 to
439  /// readLength - 1 performing the specified sequence translation.
440  /// Throws an exception if index is out of range.
441  /// \param index index into the sequence string (0 to readLength-1).
442  /// \param translation type of sequence translation to use.
443  /// \return the sequence base at the specified index into the sequence.
444  char getSequence(int index, SequenceTranslation translation);
445 
446  /// Get the quality character at the specified index into the quality 0 to
447  /// readLength - 1. Throws an exception if index is out of range.
448  /// \param index index into the quality string (0 to readLength-1).
449  /// \return the quality character at the specified index into the quality.
450  char getQuality(int index);
451 
452  /// Returns a pointer to the Cigar object associated with this record.
453  /// The object is essentially read-only, only allowing modifications
454  /// due to lazy evaluations.
455  /// \return pointer to the Cigar object.
456  Cigar* getCigarInfo();
457 
458  /// Return the number of bases in this read that overlap the passed in
459  /// region. Matches & mismatches between the read and the reference
460  /// are counted as overlaps, but insertions, deletions, skips, clips, and
461  /// pads are not counted.
462  /// \param start inclusive 0-based start position (reference position) of
463  /// the region to check for overlaps in.
464  /// (-1 indicates to start at the beginning of the reference.)
465  /// \param end exclusive 0-based end position (reference position) of the
466  /// region to check for overlaps in.
467  /// (-1 indicates to go to the end of the reference.)
468  /// \return number of overlapping bases
469  uint32_t getNumOverlaps(int32_t start, int32_t end);
470 
471  /// Returns the values of all fields except the tags.
472  /// \param recStruct structure containing the contents of all
473  /// non-variable length fields.
474  /// \param readName read name from the record (return param)
475  /// \param cigar cigar string from the record (return param)
476  /// \param sequence sequence string from the record (return param)
477  /// \param quality quality string from the record (return param)
478  /// \return true if all fields were successfully set, false otherwise.
479  bool getFields(bamRecordStruct& recStruct, String& readName,
480  String& cigar, String& sequence, String& quality);
481 
482  /// Returns the values of all fields except the tags using the specified
483  /// sequence translation.
484  /// \param recStruct structure containing the contents of all
485  /// non-variable length fields.
486  /// \param readName read name from the record (return param)
487  /// \param cigar cigar string from the record (return param)
488  /// \param sequence sequence string from the record (return param)
489  /// \param quality quality string from the record (return param)
490  /// \param translation type of sequence translation to use.
491  /// \return true if all fields were successfully set, false otherwise.
492  bool getFields(bamRecordStruct& recStruct, String& readName,
493  String& cigar, String& sequence, String& quality,
494  SequenceTranslation translation);
495 
496  /// Returns a pointer to the genome sequence object associated with this
497  /// record if it was set (NULL if it was not set).
498  /// \return pointer to the GenomeSequence object or NULL if there isn't one.
500 
501  //@}
502 
503  ///////////////////////
504  /// @name Get Tag Methods
505  /// Get methods for obtaining information on tags.
506  //@{
507 
508  /// Returns the length of the BAM formatted tags.
509  /// \return length of the BAM formatted tags.
510  uint32_t getTagLength();
511 
512  /// Get the next tag from the record.
513  /// Sets the Status to SUCCESS when a tag is successfully returned or
514  /// when there are no more tags. Otherwise the status is set to describe
515  /// why it failed (parsing, etc).
516  /// \param tag set to the tag when a tag is read.
517  /// \param vtype set to the vtype when a tag is read.
518  /// \param value pointer to the value of the tag (will need to cast
519  /// to int, float, char, or string based on vtype).
520  /// \return true if a tag was read, false if there are no more tags.
521  bool getNextSamTag(char* tag, char& vtype, void** value);
522 
523  /// Reset the tag iterator to the beginning of the tags.
524  void resetTagIter();
525 
526  /// Returns whether or not the specified vtype is an integer type.
527  /// Does not set SamStatus.
528  /// \param vtype value type to check.
529  /// \return true if the passed in vtype is an integer ('c', 'C', 's',
530  /// 'S', 'i', 'I'), false otherwise.
531  static bool isIntegerType(char vtype);
532 
533  /// Returns whether or not the specified vtype is a float type.
534  /// Does not set SamStatus.
535  /// \param vtype value type to check.
536  /// \return true if the passed in vtype is a float ('f'), false otherwise.
537  static bool isFloatType(char vtype);
538 
539  /// Returns whether or not the specified vtype is a char type.
540  /// Does not set SamStatus.
541  /// \param vtype value type to check.
542  /// \return true if the passed in vtype is a char ('A'), false otherwise.
543  static bool isCharType(char vtype);
544 
545  /// Returns whether or not the specified vtype is a string type.
546  /// Does not set SamStatus.
547  /// \param vtype value type to check.
548  /// \return true if the passed in vtype is a string ('Z'/'B'), false otherwise.
549  static bool isStringType(char vtype);
550 
551  /// Get the string representation of the tags from the record, formatted
552  /// as TAG:TYPE:VALUE<delim>TAG:TYPE:VALUE...
553  /// Sets the Status to SUCCESS when the tags are successfully returned or
554  /// the tags were not found. If a different error occurred, the status is
555  /// set appropriately.
556  /// The delimiter between the tags to retrieve is ',' or ';'. ',' was added
557  /// since the original delimiter, ';', requires the string to be quoted on
558  /// the command-line.
559  /// \param tags the tags to retrieve, formatted as TAG:TYPE,TAG:TYPE...
560  /// \param returnString the String to set (this method first clears returnString)
561  /// to TAG:TYPE:VALUE<delim>TAG:TYPE:VALUE...
562  /// \param delim delimiter to use to separate two tags, default is a tab.
563  /// \return true if there were not any errors even if no tags were found.
564  bool getTagsString(const char* tags, String& returnString, char delim = '\t');
565 
566  /// Get the string value for the specified tag.
567  /// \param tag tag to retrieve
568  /// \return pointer to the tag's string value if found, NULL if not found.
569  const String* getStringTag(const char * tag);
570 
571  /// Get the integer value for the specified tag, DEPRECATED, use one that returns a bool (success/failure).
572  /// \param tag tag to retrieve
573  /// \return pointer to the tag's integer value if found, NULL if not found.
574  int* getIntegerTag(const char * tag);
575 
576  /// Get the integer value for the specified tag.
577  /// \param tag tag to retrieve
578  /// \param tagVal return parameter with integer value for the tag
579  /// \return bool true if Integer tag was found and tagVal was set,
580  /// false if not.
581  bool getIntegerTag(const char * tag, int& tagVal);
582 
583  /// Get the float value for the specified tag.
584  /// \param tag tag to retrieve
585  /// \param tagVal return parameter with float value for the tag
586  /// \return bool true if Float tag was found and tagVal was set,
587  /// false if not.
588  bool getFloatTag(const char * tag, float& tagVal);
589 
590  /// Get the string value for the specified tag.
591  const String & getString(const char * tag);
592 
593  /// Get the integer value for the specified tag, DEPRECATED, use getIntegerTag that returns a bool.
594  int & getInteger(const char * tag);
595 
596  /// Check if the specified tag contains a string.
597  /// Does not set SamStatus.
598  /// \param tag SAM tag to check contents of.
599  /// \return true if the value associated with the tag is a string.
600  bool checkString(const char * tag)
601  { return(checkTag(tag, 'Z') || checkTag(tag, 'B')); }
602 
603  /// Check if the specified tag contains an integer.
604  /// Does not set SamStatus.
605  /// \param tag SAM tag to check contents of.
606  /// \return true if the value associated with the tag is a string.
607  bool checkInteger(const char * tag) { return checkTag(tag, 'i'); }
608 
609  /// Check if the specified tag contains a string.
610  /// Does not set SamStatus.
611  /// \param tag SAM tag to check contents of.
612  /// \return true if the value associated with the tag is a string.
613  bool checkFloat(const char * tag) { return checkTag(tag, 'f'); }
614 
615  /// Check if the specified tag contains a value of the specified vtype.
616  /// Does not set SamStatus.
617  /// \param tag SAM tag to check contents of.
618  /// \param type value type to check if the SAM tag matches.
619  /// \return true if the value associated with the tag is of the specified type.
620  bool checkTag(const char * tag, char type);
621  //@}
622 
623  /// Returns the status associated with the last method that sets the status.
624  /// \return SamStatus of the last command that sets status.
625  const SamStatus& getStatus();
626 
627 
628 private:
629  static int MAKEKEY(char ch1, char ch2, char type)
630  { return (getKeyType(type) << 16) + (ch2 << 8) + ch1; }
631 
// Map a tag value type onto the type used when building a key.
// Every char/integer type ('A', 'c', 'C', 's', 'S', 'i', 'I') maps to 'i';
// any other type is returned unchanged.
static char getKeyType(char type)
{
    const bool isCharOrIntegerType =
        (type == 'A') || (type == 'c') || (type == 'C') ||
        (type == 's') || (type == 'S') ||
        (type == 'i') || (type == 'I');

    return(isCharOrIntegerType ? 'i' : type);
}
651 
// Return the size associated with a numeric tag value type:
// 1 for 'A', 'c', 'C'; 2 for 's', 'S'; 4 for 'i', 'I', 'f';
// 0 for anything else (not a numeric type).
static inline int getNumericTagTypeSize(char type)
{
    if((type == 'A') || (type == 'c') || (type == 'C'))
    {
        return(1);
    }
    if((type == 's') || (type == 'S'))
    {
        return(2);
    }
    if((type == 'i') || (type == 'I') || (type == 'f'))
    {
        return(4);
    }
    // Not a numeric type.
    return(0);
}
674 
675  // Allocate space for the record - does a realloc.
676  // The passed in size is the size of the entire record including the
677  // block size field.
678  // Adds any errors to myStatus.
679  bool allocateRecordStructure(int size);
680 
681  void* getStringPtr(int offset);
682  void* getIntegerPtr(int offset, char& vtype);
683  void* getFloatPtr(int offset);
684 
685  // Fixes the buffer to match the variable length fields.
686  // Adds any errors to myStatus.
687  bool fixBuffer(SequenceTranslation translation);
688 
689  // Sets the Sequence and Quality strings from the buffer.
690  // They are done together in one method because they require the same
691  // loop, so might as well be done at the same time.
692  // Adds any errors to myStatus.
693  void setSequenceAndQualityFromBuffer();
694 
695  // Parse the cigar to calculate the alignment/unclipped ends and convert
696  // to SAM/BAM format.
697  // Adds any errors to myStatus.
698  bool parseCigar();
699  // Parse the cigar string to calculate the cigar length and alignment end
700  // and convert to SAM format.
701  // Adds any errors to myStatus.
702  bool parseCigarBinary();
703  // Parse the cigar string to calculate the cigar length and alignment end
704  // and convert to BAM format.
705  // Adds any errors to myStatus.
706  bool parseCigarString();
707 
708  // Set the tags from the buffer.
709  // Adds any errors to myStatus.
710  bool setTagsFromBuffer();
711 
712  // Set the tags in the buffer.
713  // Adds any errors to myStatus.
714  bool setTagsInBuffer();
715 
716  void setVariablesForNewBuffer(SamFileHeader& header);
717 
718  void getTypeFromKey(int key, char& type) const;
719  void getTag(int key, char* tag) const;
720 
721  String & getString(int offset);
722  int & getInteger(int offset);
723  const char & getIntegerType(int offset) const;
724  float & getFloat(int offset);
725 
726  // Append the string representation of the value at the specified index
727  // of the int array.
728  inline void appendIntArrayValue(int index, String& strVal) const
729  {
730  appendIntArrayValue(intType[index], integers[index], strVal);
731  }
732 
733  void appendIntArrayValue(char type, int value, String& strVal) const;
734 
735  int getBtagBufferSize(String& tagStr);
736  int setBtagBuffer(String& tagStr, char* extraPtr);
737  int getStringFromBtagBuffer(unsigned char* buffer, String& tagStr);
738 
739  static const int DEFAULT_BLOCK_SIZE = 40;
740  static const int DEFAULT_BIN = 4680;
741  static const int DEFAULT_READ_NAME_LENGTH = 8;
742  static const char* DEFAULT_READ_NAME;
743  static const char* FIELD_ABSENT_STRING;
744 
745  bamRecordStruct * myRecordPtr;
746  int allocatedSize;
747 
748  // Pointer to a temporary cigar buffer that can be used during string
749  // parsing before it is ready to be copied into the actual record.
750  uint32_t* myCigarTempBuffer;
751 
752  // Size of the currently allocated temporary cigar buffer.
753  int myCigarTempBufferAllocatedSize;
754 
755  // Length of the cigar currently contained in the temporary buffer.
756  int myCigarTempBufferLength;
757 
758  // Track if the buffer is in sync with the Strings/Tags.
759  // Set to false if any of the variable length fields are modified.
760  // Set to true when the buffer is updated to match the variable length
761  // fields.
762  bool myIsBufferSynced;
763 
764  // Track if the tags need to be set from the buffer.
765  bool myNeedToSetTagsFromBuffer;
766 
767  // Track if the tags need to be set in the buffer.
768  // Allows you to set just the tags if they are the only thing that changed
769  // in the buffer.
770  bool myNeedToSetTagsInBuffer;
771 
772  int myTagBufferSize;
773  int myLastTagIndex;
774 
775  String myReadName;
776  String myReferenceName;
777  String myMateReferenceName;
778  String myCigar;
779  String mySequence;
780  String myQuality;
781 
782  std::string mySeqWithEq;
783  std::string mySeqWithoutEq;
784 
785  // The length of the alignment.
786  int32_t myAlignmentLength;
787  // Unclipped alignment positions.
788  int32_t myUnclippedStartOffset;
789  int32_t myUnclippedEndOffset;
790 
791  CigarRoller myCigarRoller;
792 
793  LongHash<int> extras;
794  // Note: not all values in strings, integers, and floats are always
795  // in extras. They will not be if the tags were removed. Removed
796  // tags are removed from extras, but not from strings, integers, or floats
797  // since if one was removed from these arrays, all other entries would
798  // need their indices updated in extras.
799  StringArray strings;
800  IntArray integers;
801  std::vector<char> intType; // contains the type of int at same position in integers.
802  std::vector<float> floats;
803 
804 
805  // Track whether or not the buffer values are correct for
806  // each setting.
807  bool myIsReadNameBufferValid;
808  bool myIsCigarBufferValid;
809  bool myIsSequenceBufferValid;
810  bool myIsQualityBufferValid;
811  bool myIsTagsBufferValid;
812  bool myIsBinValid;
813 
814  unsigned char* myPackedSequence;
815  unsigned char* myPackedQuality;
816 
817 
818  SamStatus myStatus;
819 
820  // The current translation of the sequence as it occurs in the buffer.
821  // Only applicable if myIsSequenceBufferValid == true.
822  SequenceTranslation myBufferSequenceTranslation;
823 
824 
825  // Track the Reference.
826  GenomeSequence* myRefPtr;
827 
828  // The type of translation to do when getting a sequence.
829  SequenceTranslation mySequenceTranslation;
830 
831  String NOT_FOUND_TAG_STRING;
832  int NOT_FOUND_TAG_INT;
833 
834  static const int myMaxWarns = 5;
835  static int myNumWarns;
836 };
837 
838 #endif
The purpose of this class is to provide accessors for setting, updating, modifying the CIGAR object....
Definition: CigarRoller.h:67
This class represents the CIGAR without any methods to set the cigar (see CigarRoller for that).
Definition: Cigar.h:84
HandlingType
This specifies how this class should respond to errors.
Definition: ErrorHandler.h:29
Create/Access/Modify/Load Genome Sequences stored as binary mapped files.
Class for easily reading/writing files without having to worry about file type (uncompressed,...
Definition: InputFile.h:37
This class allows a user to get/set the fields in a SAM/BAM Header.
Definition: SamFileHeader.h:35
Class providing an easy to use interface to get/set/operate on the fields in a SAM/BAM record.
Definition: SamRecord.h:52
int32_t getBlockSize()
Get the block size of the record (BAM format).
Definition: SamRecord.cpp:1281
uint16_t getCigarLength()
Get the length of the BAM formatted CIGAR.
Definition: SamRecord.cpp:1362
const char * getReferenceName()
Get the reference sequence name (RNAME) of the record.
Definition: SamRecord.cpp:1298
SequenceTranslation
Enum containing the settings on how to translate the sequence if a reference is available.
Definition: SamRecord.h:57
@ NONE
Leave the sequence as is.
Definition: SamRecord.h:58
@ BASES
Translate '=' to the actual base.
Definition: SamRecord.h:60
@ EQUAL
Translate bases that match the reference to '='.
Definition: SamRecord.h:59
bool setReadName(const char *readName)
Set QNAME to the passed in name.
Definition: SamRecord.cpp:193
int32_t getInsertSize()
Get the inferred insert size of the read pair (ISIZE) or observed template length (TLEN).
Definition: SamRecord.cpp:1459
bool checkString(const char *tag)
Check if the specified tag contains a string.
Definition: SamRecord.h:600
int32_t get0BasedMatePosition()
Get the 0-based(BAM) leftmost mate/next fragment's position.
Definition: SamRecord.cpp:1452
int32_t get1BasedPosition()
Get the 1-based(SAM) leftmost position (POS) of the record.
Definition: SamRecord.cpp:1312
void clearTags()
Clear the tags in this record.
Definition: SamRecord.cpp:977
bool addIntTag(const char *tag, int32_t value)
Add the specified integer tag to the record.
Definition: SamRecord.cpp:647
int32_t getReferenceID()
Get the reference sequence id of the record (BAM format rid).
Definition: SamRecord.cpp:1305
bool getTagsString(const char *tags, String &returnString, char delim='\t')
Get the string representation of the tags from the record, formatted as TAG:TYPE:VALUE<delim>TAG:TYPE...
Definition: SamRecord.cpp:2082
GenomeSequence * getReference()
Returns a pointer to the genome sequence object associated with this record if it was set (NULL if it...
Definition: SamRecord.cpp:1923
int32_t getAlignmentLength()
Returns the length of the clipped sequence, returning 0 if the cigar is '*'.
Definition: SamRecord.cpp:1493
int & getInteger(const char *tag)
Get the integer value for the specified tag, DEPRECATED, use getIntegerTag that returns a bool.
Definition: SamRecord.cpp:2350
bool setInsertSize(int32_t insertSize)
Sets the inferred insert size (ISIZE)/observed template length (TLEN).
Definition: SamRecord.cpp:336
int32_t get1BasedAlignmentEnd()
Returns the 1-based inclusive rightmost position of the clipped sequence.
Definition: SamRecord.cpp:1486
uint32_t getTagLength()
Returns the length of the BAM formatted tags.
Definition: SamRecord.cpp:1929
SamRecord()
Default Constructor.
Definition: SamRecord.cpp:34
static bool isIntegerType(char vtype)
Returns whether or not the specified vtype is an integer type.
Definition: SamRecord.cpp:2040
bool rmTag(const char *tag, char type)
Remove a tag.
Definition: SamRecord.cpp:992
bool setMateReferenceName(SamFileHeader &header, const char *mateReferenceName)
Set the mate/next fragment's reference sequence name (RNEXT) to the specified name,...
Definition: SamRecord.cpp:297
uint8_t getReadNameLength()
Get the length of the readname (QNAME) including the null.
Definition: SamRecord.cpp:1326
bool checkFloat(const char *tag)
Check if the specified tag contains a float.
Definition: SamRecord.h:613
Cigar * getCigarInfo()
Returns a pointer to the Cigar object associated with this record.
Definition: SamRecord.cpp:1836
bool getFloatTag(const char *tag, float &tagVal)
Get the float value for the specified tag.
Definition: SamRecord.cpp:2281
SamStatus::Status writeRecordBuffer(IFILE filePtr)
Write the record as a BAM into the specified already opened file.
Definition: SamRecord.cpp:1237
const char * getMateReferenceNameOrEqual()
Get the mate/next fragment's reference sequence name (RNEXT), returning "=" if it is the same as the ...
Definition: SamRecord.cpp:1420
bool setMapQuality(uint8_t mapQuality)
Set the mapping quality (MAPQ).
Definition: SamRecord.cpp:251
static bool isFloatType(char vtype)
Returns whether or not the specified vtype is a float type.
Definition: SamRecord.cpp:2052
SamStatus::Status setBuffer(const char *fromBuffer, uint32_t fromBufferSize, SamFileHeader &header)
Sets the SamRecord to contain the information in the BAM formatted fromBuffer.
Definition: SamRecord.cpp:525
int32_t get1BasedUnclippedStart()
Returns the 1-based inclusive left-most position adjusted for clipped bases.
Definition: SamRecord.cpp:1519
bool addTag(const char *tag, char vtype, const char *value)
Add the specified tag,vtype,value to the record.
Definition: SamRecord.cpp:791
uint16_t getBin()
Get the BAM bin for the record.
Definition: SamRecord.cpp:1347
bool isValid(SamFileHeader &header)
Returns whether or not the record is valid, setting the status to indicate success or failure.
Definition: SamRecord.cpp:161
int32_t getMateReferenceID()
Get the mate reference id of the record (BAM format: mate_rid/next_refID).
Definition: SamRecord.cpp:1438
bool getFields(bamRecordStruct &recStruct, String &readName, String &cigar, String &sequence, String &quality)
Returns the values of all fields except the tags.
Definition: SamRecord.cpp:1866
bool set0BasedMatePosition(int32_t matePosition)
Set the mate/next fragment's leftmost position using the specified 0-based (BAM format) value.
Definition: SamRecord.cpp:328
void resetRecord()
Reset the fields of the record to a default value.
Definition: SamRecord.cpp:91
bool setFlag(uint16_t flag)
Set the bitwise FLAG to the specified value.
Definition: SamRecord.cpp:215
bool set1BasedPosition(int32_t position)
Set the leftmost position (POS) using the specified 1-based (SAM format) value.
Definition: SamRecord.cpp:236
SamStatus::Status setBufferFromFile(IFILE filePtr, SamFileHeader &header)
Read the BAM record from a file.
Definition: SamRecord.cpp:558
uint16_t getFlag()
Get the flag (FLAG).
Definition: SamRecord.cpp:1384
const void * getRecordBuffer()
Get a const pointer to the buffer that contains the BAM representation of the record.
Definition: SamRecord.cpp:1204
void setSequenceTranslation(SequenceTranslation translation)
Set the type of sequence translation to use when getting the sequence.
Definition: SamRecord.cpp:187
bool checkInteger(const char *tag)
Check if the specified tag contains an integer.
Definition: SamRecord.h:607
int32_t get1BasedMatePosition()
Get the 1-based(SAM) leftmost mate/next fragment's position (PNEXT).
Definition: SamRecord.cpp:1445
int32_t get0BasedUnclippedEnd()
Returns the 0-based inclusive right-most position adjusted for clipped bases.
Definition: SamRecord.cpp:1526
bool shiftIndelsLeft()
Shift the indels (if any) to the left by updating the CIGAR.
Definition: SamRecord.cpp:368
int * getIntegerTag(const char *tag)
Get the integer value for the specified tag, DEPRECATED, use one that returns a bool (success/failure...
Definition: SamRecord.cpp:2216
const SamStatus & getStatus()
Returns the status associated with the last method that sets the status.
Definition: SamRecord.cpp:2403
static bool isCharType(char vtype)
Returns whether or not the specified vtype is a char type.
Definition: SamRecord.cpp:2062
bool setCigar(const char *cigar)
Set the CIGAR to the specified SAM formatted cigar string.
Definition: SamRecord.cpp:259
int32_t get1BasedUnclippedEnd()
Returns the 1-based inclusive right-most position adjusted for clipped bases.
Definition: SamRecord.cpp:1535
uint32_t getNumOverlaps(int32_t start, int32_t end)
Return the number of bases in this read that overlap the passed in region.
Definition: SamRecord.cpp:1853
const char * getMateReferenceName()
Get the mate/next fragment's reference sequence name (RNEXT).
Definition: SamRecord.cpp:1410
bool checkTag(const char *tag, char type)
Check if the specified tag contains a value of the specified vtype.
Definition: SamRecord.cpp:2381
bool getNextSamTag(char *tag, char &vtype, void **value)
Get the next tag from the record.
Definition: SamRecord.cpp:1962
void setReference(GenomeSequence *reference)
Set the reference to the specified genome sequence object.
Definition: SamRecord.cpp:178
bool setSequence(const char *seq)
Sets the sequence (SEQ) to the specified SAM formatted sequence string.
Definition: SamRecord.cpp:344
int32_t get0BasedUnclippedStart()
Returns the 0-based inclusive left-most position adjusted for clipped bases.
Definition: SamRecord.cpp:1506
int32_t getReadLength()
Get the length of the read.
Definition: SamRecord.cpp:1391
int32_t get0BasedAlignmentEnd()
Returns the 0-based inclusive rightmost position of the clipped sequence.
Definition: SamRecord.cpp:1467
const String * getStringTag(const char *tag)
Get the string value for the specified tag.
Definition: SamRecord.cpp:2180
bool set1BasedMatePosition(int32_t matePosition)
Set the mate/next fragment's leftmost position (PNEXT) using the specified 1-based (SAM format) value...
Definition: SamRecord.cpp:322
int32_t get0BasedPosition()
Get the 0-based(BAM) leftmost position of the record.
Definition: SamRecord.cpp:1319
const char * getCigar()
Returns the SAM formatted CIGAR string.
Definition: SamRecord.cpp:1555
uint8_t getMapQuality()
Get the mapping quality (MAPQ) of the record.
Definition: SamRecord.cpp:1340
const String & getString(const char *tag)
Get the string value for the specified tag.
Definition: SamRecord.cpp:2314
bool set0BasedPosition(int32_t position)
Set the leftmost position using the specified 0-based (BAM format) value.
Definition: SamRecord.cpp:242
const char * getReadName()
Returns the SAM formatted Read Name (QNAME).
Definition: SamRecord.cpp:1542
void resetTagIter()
Reset the tag iterator to the beginning of the tags.
Definition: SamRecord.cpp:2034
bool setQuality(const char *quality)
Sets the quality (QUAL) to the specified SAM formatted quality string.
Definition: SamRecord.cpp:357
bool setReferenceName(SamFileHeader &header, const char *referenceName)
Set the reference sequence name (RNAME) to the specified name, using the header to determine the refe...
Definition: SamRecord.cpp:223
const char * getQuality()
Returns the SAM formatted quality string (QUAL).
Definition: SamRecord.cpp:1638
~SamRecord()
Destructor.
Definition: SamRecord.cpp:72
const char * getSequence()
Returns the SAM formatted sequence string (SEQ), translating the base as specified by setSequenceTran...
Definition: SamRecord.cpp:1568
bool rmTags(const char *tags)
Remove tags.
Definition: SamRecord.cpp:1083
static bool isStringType(char vtype)
Returns whether or not the specified vtype is a string type.
Definition: SamRecord.cpp:2072
This class is used to track the status results of some methods in the BAM classes.
Definition: StatGenStatus.h:27
Status
Return value enum for StatGenFile methods.
Definition: StatGenStatus.h:32
Structure of a BAM record.
Definition: SamRecord.h:34