libStatGen Software  1
SamValidation.cpp
1 /*
2  * Copyright (C) 2010-2015 Regents of the University of Michigan
3  *
4  * This program is free software: you can redistribute it and/or modify
5  * it under the terms of the GNU General Public License as published by
6  * the Free Software Foundation, either version 3 of the License, or
7  * (at your option) any later version.
8  *
9  * This program is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program. If not, see <http://www.gnu.org/licenses/>.
16  */
17 
18 #include <iostream>
19 
20 #include "SamValidation.h"
21 #include "CigarRoller.h"
22 #include "SamTags.h"
23 
24 const char* SamValidationError::enumSeverityString[] = {
25  "WARNING", "ERROR"};
26 
27 const char* SamValidationError::enumTypeString[] = {
28  "INVALID_QNAME",
29  "INVALID_REF_ID",
30  "INVALID_RNAME",
31  "INVALID_POS",
32  "INVALID_MAPQ",
33  "INVALID_CIGAR",
34  "INVALID_MRNM",
35  "INVALID_QUAL",
36  "INVALID_TAG"
37 };
38 
40 {
41  return(enumTypeString[type]);
42 }
43 
44 
46  std::string message)
47 {
48  myType = type;
49  mySeverity = severity;
50  myMessage = message;
51 }
52 
53 
55 {
56  return(myType);
57 }
58 
59 
61 {
62  return(mySeverity);
63 }
64 
65 
66 const char* SamValidationError::getMessage() const
67 {
68  return(myMessage.c_str());
69 }
70 
71 
73 {
74  return(enumTypeString[myType]);
75 }
76 
77 
79 {
80  return(enumSeverityString[mySeverity]);
81 }
82 
83 
84 void SamValidationError::getErrorString(std::string& errorString) const
85 {
86  errorString = getTypeString();
87  errorString += " (";
88  errorString += getSeverityString();
89  errorString += ") : ";
90  errorString += getMessage();
91  errorString += "\n";
92 }
93 
94 
96 {
97  std::cerr << this;
98 }
99 
100 
101 
102 // Constructor.
104  : myValidationErrors()
105 {
106  myErrorIter = myValidationErrors.begin();
107 }
108 
109 
110 // Destructor
112 {
113  clear();
114 }
115 
116 
118 {
119  // Clear the errors.
120  std::list<const SamValidationError*>::iterator errorIter;
121  for(errorIter = myValidationErrors.begin();
122  errorIter != myValidationErrors.end(); ++errorIter)
123  {
124  delete *errorIter;
125  *errorIter = NULL;
126  }
127  myValidationErrors.clear();
128  myErrorIter = myValidationErrors.end();
129 }
130 
131 
133  SamValidationError::Severity newSeverity,
134  const char* newMessage)
135 {
136  myValidationErrors.push_back(new SamValidationError(newType,
137  newSeverity,
138  newMessage));
139 
140  // If this is the first element in the list, set the iterator.
141  if(myValidationErrors.size() == 1)
142  {
143  // set the iterator to the first element.
144  myErrorIter = myValidationErrors.begin();
145  }
146 }
147 
148 
149 
150 // Return the number of validation errors that are contained in this object.
152 {
153  return(myValidationErrors.size());
154 }
155 
156 
157 // Return a pointer to the next error. It does not remove it from the list.
158 // Returns null once all errors have been retrieved until resetErrorIter
159 // is called.
161 {
162  if(myErrorIter == myValidationErrors.end())
163  {
164  // at the end of the list, return null.
165  return(NULL);
166  }
167  // Not at the end of the list, return the last element and increment.
168  return(*myErrorIter++);
169 }
170 
171 
172 // Resets the iterator to the begining of the errors.
174 {
175  myErrorIter = myValidationErrors.begin();
176 }
177 
178 
179 // Appends the error messages to the passed in string.
180 void SamValidationErrors::getErrorString(std::string& errorString) const
181 {
182  for(std::list<const SamValidationError*>::
183  const_iterator validationErrorIter =
184  myValidationErrors.begin();
185  validationErrorIter != myValidationErrors.end();
186  validationErrorIter++)
187  {
188  std::string error = "";
189  (*validationErrorIter)->getErrorString(error);
190  errorString += error;
191  }
192 }
193 
194 
195 bool SamValidator::isValid(SamFileHeader& samHeader, SamRecord& samRecord,
196  SamValidationErrors& validationErrors)
197 {
198  bool status = true;
199  status &= isValidQname(samRecord.getReadName(),
200  samRecord.getReadNameLength(),
201  validationErrors);
202 
203  status &= isValidFlag(samRecord.getFlag(),
204  validationErrors);
205 
206  // Validate the RName including validating it against the header.
207  status &= isValidRname(samHeader,
208  samRecord.getReferenceName(),
209  validationErrors);
210 
211  status &= isValidRefID(samRecord.getReferenceID(),
212  samHeader.getReferenceInfo(),
213  validationErrors);
214 
215  status &= isValid1BasedPos(samRecord.get1BasedPosition(),
216  validationErrors);
217 
218  status &= isValidMapQuality(samRecord.getMapQuality(), validationErrors);
219 
220  status &= isValidSequence(samRecord, validationErrors);
221 
222  status &= isValidCigar(samRecord, validationErrors);
223 
224  status &= isValidQuality(samRecord, validationErrors);
225 
226  status &= isValidTags(samRecord, validationErrors);
227 
228  return(status);
229 }
230 
231 
232 // qname is the query (read) name - result of SamRecord::getReadName().
233 // readNameLen is the length of the read name including the null (the result
234 // of SamRecord::getReadNameLength()).
235 // For some invalid records, the getReadNameLength may be different than the
236 // length of qname.
237 // NOTE: Query Name and Read Name both refer to the same field.
238 bool SamValidator::isValidQname(const char* qname, uint8_t readNameLen,
239  SamValidationErrors& validationErrors)
240 {
241  // Validation for QNAME is:
242  // a) length of the qname string is the same as the read name length
243  // b) length is between 1 and 254.
244  // c) [ \t\n\r] are not allowed in the name.
245 
246  bool status = true;
247 
248  // Get the length of the qname string.
249  int32_t qnameLenNull = strlen(qname) + 1;
250 
251  ////////////////////////////////////
252  // a) length of the qname string is the same as the read name length
253  if(qnameLenNull != readNameLen)
254  {
255  // This results from a poorly formatted bam file, where the null
256  // terminated read_name field is not the same length as specified by
257  // read_name_len.
258  String message = "Invalid Query Name - the string length (";
259  message += qnameLenNull;
260  message += ") does not match the specified query name length (";
261  message += readNameLen;
262  message += ").";
263 
266  message.c_str());
267  status = false;
268  }
269 
270  ////////////////////////////////////
271  // b) length is between 1 and 254
272  // The length with the terminating null must be between 2 & 255,
273  if((qnameLenNull < 2) || (qnameLenNull > 255))
274  {
275  String message = "Invalid Query Name (QNAME) length: ";
276  message += qnameLenNull;
277  message += ". Length with the terminating null must be between 2 & 255.";
278 
281  message.c_str());
282  status = false;
283  }
284 
285  ////////////////////////////////////
286  // Loop through and validate they all characters are valid.
287  // c) [ \t\n\r] are not allowed in the name.
288  String message;
289  for(int i = 0; i < qnameLenNull; ++i)
290  {
291  switch(qname[i])
292  {
293  case ' ':
294  // Invalid character.
295  message = "Invalid character in the Query Name (QNAME): ' ' at position ";
296  message += i;
297  message += ".";
300  message.c_str());
301  status = false;
302  break;
303  case '\t':
304  // Invalid character.
305  message = "Invalid character in the Query Name (QNAME): '\t' at position ";
306  message += i;
307  message += ".";
310  message.c_str());
311  status = false;
312  break;
313  case '\n':
314  // Invalid character.
315  message = "Invalid character in the Query Name (QNAME): '\n' at position ";
316  message += i;
317  message += ".";
320  message.c_str());
321  status = false;
322  break;
323  case '\r':
324  // Invalid character.
325  message = "Invalid character in the Query Name (QNAME): '\r' at position ";
326  message += i;
327  message += ".";
330  message.c_str());
331  status = false;
332  break;
333  }
334  }
335 
336  return(status);
337 }
338 
339 
340 bool SamValidator::isValidFlag(uint16_t flag,
341  SamValidationErrors& validationErrors)
342 {
343  // All values in a uint16_t are valid, so return true.
344  return(true);
345 }
346 
347 
349  const char* rname,
350  SamValidationErrors& validationErrors)
351 {
352  bool status = true;
353 
354  // Cross validate the rname and the header.
355  // If the rname is not '*'
356  // AND there are any SQ records in the header,
357  // Then the rname must be in one of them.
358  if((strcmp(rname, "*") != 0) &&
359  (samHeader.getNumSQs() != 0) &&
360  (samHeader.getSQ(rname) == NULL))
361  {
362  // There are SQ fields, but the ref name is not in it.
363  status = false;
364  std::string message = "RNAME, ";
365  message += rname;
366  message += ", was not found in a SAM Header SQ record";
369  message.c_str());
370  }
371  status &= isValidRname(rname, validationErrors);
372  return(status);
373 }
374 
375 
376 bool SamValidator::isValidRname(const char* rname,
377  SamValidationErrors& validationErrors)
378 {
379  // Validation for RNAME is:
380  // a) cannot be 0 length.
381  // b) [ \t\n\r@=] are not allowed in the name.
382 
383  bool status = true;
384 
385  // Get the length of the rname string.
386  int32_t rnameLen = strlen(rname);
387 
388  String message;
389 
390  if(rnameLen == 0)
391  {
394  "Reference Sequence Name (RNAME) cannot have 0 length.");
395  status = false;
396  }
397 
398  ////////////////////////////////////
399  ////////////////////////////////////
400  // Loop through and validate they all characters are valid.
401  // b) [ \t\n\r] are not allowed in the name.
402  for(int i = 0; i < rnameLen; ++i)
403  {
404  switch(rname[i])
405  {
406  case ' ':
407  // Invalid character.
408  message = "Invalid character in the Reference Sequence Name (RNAME): ' ' at position ";
409  message += i;
410  message += ".";
413  message.c_str());
414  status = false;
415  break;
416  case '\t':
417  // Invalid character.
418  message = "Invalid character in the Reference Sequence Name (RNAME): '\t' at position ";
419  message += i;
420  message += ".";
423  message.c_str());
424  status = false;
425  break;
426  case '\n':
427  // Invalid character.
428  message = "Invalid character in the Reference Sequence Name (RNAME): '\n' at position ";
429  message += i;
430  message += ".";
433  message.c_str());
434  status = false;
435  break;
436  case '\r':
437  // Invalid character.
438  message = "Invalid character in the Reference Sequence Name (RNAME): '\r' at position ";
439  message += i;
440  message += ".";
443  message.c_str());
444  status = false;
445  break;
446  case '@':
447  // Invalid character.
448  message = "Invalid character in the Reference Sequence Name (RNAME): '@' at position ";
449  message += i;
450  message += ".";
453  message.c_str());
454  status = false;
455  break;
456  case '=':
457  // Invalid character.
458  message = "Invalid character in the Reference Sequence Name (RNAME): '=' at position ";
459  message += i;
460  message += ".";
463  message.c_str());
464  status = false;
465  break;
466  default:
467  // Allowed character.
468  break;
469  }
470  }
471 
472  return(status);
473 }
474 
475 
476 bool SamValidator::isValidRefID(int32_t refID,
477  const SamReferenceInfo& refInfo,
478  SamValidationErrors& validationErrors)
479 {
480  // Validation for rID is:
481  // a) must be between -1 and the number of refInfo.
482  // -1 is allowed, and otherwise it must properly index into the array.
483 
484  bool status = true;
485  if((refID < -1) || (refID >= refInfo.getNumEntries()))
486  {
487  // Reference ID is too large or too small.
488  String message = "Invalid Reference ID, out of range (";
489  message += refID;
490  message += ") must be between -1 and ";
491  message += refInfo.getNumEntries() - 1;
492  message += ".";
493 
496  message.c_str());
497  status = false;
498  }
499 
500  return(status);
501 }
502 
503 
505  SamValidationErrors& validationErrors)
506 {
507  // Validation for pos is:
508  // a) must be between 0 and (2^29)-1.
509 
510  bool status = true;
511 
512  if((pos < 0) || (pos > 536870911))
513  {
514  String message = "POS out of range (";
515  message += pos;
516  message += ") must be between 0 and (2^29)-1.";
517 
518  validationErrors.addError(SamValidationError::INVALID_POS,
520  message.c_str());
521  status = false;
522  }
523 
524  return(status);
525 }
526 
527 
528 bool SamValidator::isValidMapQuality(uint8_t mapQuality,
529  SamValidationErrors& validationErrors)
530 {
531  // All values in a uint8_t are valid, so return true.
532  return(true);
533 }
534 
535 
537  SamValidationErrors& validationErrors)
538 {
539  return(true);
540 }
541 
542 
544  SamValidationErrors& validationErrors)
545 {
546  return(isValidCigar(samRecord.getCigar(),
547  samRecord.getReadLength(),
548  validationErrors));
549 }
550 
551 bool SamValidator::isValidCigar(const char* cigar,
552  const char* sequence,
553  SamValidationErrors& validationErrors)
554 {
555  if(strcmp(sequence, "*") != 0)
556  {
557  return(isValidCigar(cigar, strlen(sequence), validationErrors));
558  }
559  // Sequence is '*', so the length is 0.
560  return(isValidCigar(cigar, 0, validationErrors));
561 }
562 
563 bool SamValidator::isValidCigar(const char* cigar,
564  int seqLen,
565  SamValidationErrors& validationErrors)
566 {
567  // Validation for CIGAR is:
568  // a) cannot be 0 length.
569  // if not "*", validate the following:
570  // b) must have an integer length for each operator (if not "*"). TODO
571  // c) all operators must be valid (if not "*"). TODO
572  // d) evaluates to the same read length as the sequence string if not '*'.
573  bool status = true;
574  String message;
575 
576  int32_t cigarLen = strlen(cigar);
577 
578  // a) cannot be 0 length.
579  if(cigarLen == 0)
580  {
583  "Cigar must not be blank.");
584  status = false;
585  }
586 
587  if(strcmp(cigar, "*") != 0)
588  {
589  // The cigar is not "*", so validate it.
590  CigarRoller cigarRoller(cigar);
591 
592  // b) must have an integer length for each operator.
593  // TODO
594  // c) all operators must be valid.
595  // TODO
596 
597  // d) is the same length as the sequence string.
598  int cigarSeqLen = cigarRoller.getExpectedQueryBaseCount();
599  if((cigarSeqLen != seqLen) && (seqLen != 0))
600  {
601  message = "CIGAR does not evaluate to the same length as SEQ, (";
602  message += cigarSeqLen;
603  message += " != ";
604  message += seqLen;
605  message += ").";
608  message.c_str());
609  status = false;
610  }
611  }
612  return(status);
613 }
614 
615 
617  SamValidationErrors& validationErrors)
618 {
619  return(isValidQuality(samRecord.getQuality(),
620  samRecord.getReadLength(),
621  validationErrors));
622 }
623 
624 
625 bool SamValidator::isValidQuality(const char* quality,
626  const char* sequence,
627  SamValidationErrors& validationErrors)
628 {
629  // Determine the length of the sequence.
630  int seqLen = strlen(sequence);
631 
632  // Check if the sequence is '*' since then the seqLength is 0.
633  if(strcmp(sequence, "*") == 0)
634  {
635  seqLen = 0;
636  }
637  return(isValidQuality(quality, seqLen, validationErrors));
638 }
639 
640 
641 bool SamValidator::isValidQuality(const char* quality,
642  int seqLength,
643  SamValidationErrors& validationErrors)
644 {
645  bool status = true;
646 
647  // If the quality or the sequence are non-"*", validate that the quality
648  // and sequence have the same length.
649  if((seqLength != 0) && (strcmp(quality, "*") != 0))
650  {
651  int qualLen = strlen(quality);
652  // Both the sequence and the quality are not "*", so validate
653  // that they are the same length.
654  if(seqLength != qualLen)
655  {
656  // Both fields are specified but are different lengths.
657 
658  String message = "QUAL is not the same length as SEQ, (";
659  message += qualLen;
660  message += " != ";
661  message += seqLength;
662  message += ").";
663 
666  message.c_str());
667  status = false;
668  }
669  }
670  return(status);
671 }
672 
673 
675  SamValidationErrors& validationErrors)
676 {
677  bool status = true;
678 
679  GenomeSequence* reference = samRecord.getReference();
680  // If the reference is not null, check the MD tag.
681  if(reference != NULL)
682  {
683  const String* recordMD = samRecord.getStringTag(SamTags::MD_TAG);
684  if(recordMD != NULL)
685  {
686  // The record has an MD tag so check to see if it is correct.
687  if(!SamTags::isMDTagCorrect(samRecord, *reference))
688  {
689  // Invalid MD tags.
690  String correctMD;
691  if(!SamTags::createMDTag(correctMD, samRecord, *reference))
692  {
693  // Failed to get the MD tag, so indicate that it is unknown.
694  correctMD = "UNKNOWN";
695  }
696  String message = "Incorrect MD Tag, ";
697  message += *recordMD;
698  message += ", should be ";
699  message += correctMD;
700  message += ".";
701 
702  validationErrors.addError(SamValidationError::INVALID_TAG,
704  message.c_str());
705 
706  status = false;
707  }
708  }
709  }
710 
711  return(status);
712 }
The purpose of this class is to provide accessors for setting, updating, modifying the CIGAR object....
Definition: CigarRoller.h:67
int getExpectedQueryBaseCount() const
Return the length of the read that corresponds to the current CIGAR string.
Definition: Cigar.cpp:95
Create/Access/Modify/Load Genome Sequences stored as binary mapped files.
This class allows a user to get/set the fields in a SAM/BAM Header.
Definition: SamFileHeader.h:35
SamHeaderSQ * getSQ(const char *name)
Get the SQ object with the specified sequence name, returning NULL if there is no SQ object with that...
int getNumSQs()
Get the number of SQ objects.
const SamReferenceInfo & getReferenceInfo() const
Get the Reference Information.
Class providing an easy to use interface to get/set/operate on the fields in a SAM/BAM record.
Definition: SamRecord.h:52
const char * getReferenceName()
Get the reference sequence name (RNAME) of the record.
Definition: SamRecord.cpp:1298
int32_t get1BasedPosition()
Get the 1-based(SAM) leftmost position (POS) of the record.
Definition: SamRecord.cpp:1312
int32_t getReferenceID()
Get the reference sequence id of the record (BAM format rid).
Definition: SamRecord.cpp:1305
GenomeSequence * getReference()
Returns a pointer to the genome sequence object associated with this record if it was set (NULL if it...
Definition: SamRecord.cpp:1923
uint8_t getReadNameLength()
Get the length of the readname (QNAME) including the null.
Definition: SamRecord.cpp:1326
uint16_t getFlag()
Get the flag (FLAG).
Definition: SamRecord.cpp:1384
int32_t getReadLength()
Get the length of the read.
Definition: SamRecord.cpp:1391
const String * getStringTag(const char *tag)
Get the string value for the specified tag.
Definition: SamRecord.cpp:2180
const char * getCigar()
Returns the SAM formatted CIGAR string.
Definition: SamRecord.cpp:1555
uint8_t getMapQuality()
Get the mapping quality (MAPQ) of the record.
Definition: SamRecord.cpp:1340
const char * getReadName()
Returns the SAM formatted Read Name (QNAME).
Definition: SamRecord.cpp:1542
const char * getQuality()
Returns the SAM formatted quality string (QUAL).
Definition: SamRecord.cpp:1638
Class for tracking the reference information mapping between the reference ids and the reference name...
int32_t getNumEntries() const
Get the number of entries contained here.
static bool isMDTagCorrect(SamRecord &inputRec, GenomeSequence &genome)
Check to see if the MD tag in the record is accurate.
Definition: SamTags.cpp:126
static bool createMDTag(String &outputMDtag, SamRecord &inputRec, GenomeSequence &genome)
Create the MD tag for the specified input record and the genome.
Definition: SamTags.cpp:34
The SamValidationError class describes a validation error that occured, containing the error type,...
Definition: SamValidation.h:35
Type getType() const
Return the type enum of this validation error object.
const char * getSeverityString() const
Return the string representing this object's severity of validation error.
void printError() const
Print a formatted output of the error to cerr.
void getErrorString(std::string &errorString) const
Get the error string representing this object's error.
Severity
Severity of the error.
Definition: SamValidation.h:39
@ WARNING
Warning is used if it is just an invalid value.
Definition: SamValidation.h:40
@ ERROR
Error is used if parsing could not succeed.
Definition: SamValidation.h:41
const char * getMessage() const
Return the error message of this validation error object.
SamValidationError(Type type, Severity severity, std::string Message)
Constructor that sets the type, severity, and message for the validation error.
Type
Type of the error.
Definition: SamValidation.h:48
@ INVALID_REF_ID
Invalid reference id.
Definition: SamValidation.h:50
@ INVALID_TAG
Invalid tag.
Definition: SamValidation.h:57
@ INVALID_QNAME
Invalid read/query name.
Definition: SamValidation.h:49
@ INVALID_CIGAR
Invalid CIGAR.
Definition: SamValidation.h:54
@ INVALID_POS
Invalid position.
Definition: SamValidation.h:52
@ INVALID_RNAME
Invalid reference name.
Definition: SamValidation.h:51
@ INVALID_QUAL
Invalid base quality.
Definition: SamValidation.h:56
Severity getSeverity() const
Return the severity enum of this validation error object.
const char * getTypeString() const
Return the string representing this object's type of validation error.
The SamValidationErrors class is a container class that holds SamValidationError Objects,...
void getErrorString(std::string &errorString) const
Append the error messages contained in this container to the passed in string.
const SamValidationError * getNextError()
Return a pointer to the next error without removing it from the container, and returning null once al...
SamValidationErrors()
Constructor.
void resetErrorIter()
Reset the iterator to the begining of the errors.
void clear()
Remove all the errors from the container.
unsigned int numErrors()
Return the number of validation errors contained in this object.
void addError(SamValidationError::Type newType, SamValidationError::Severity newSeverity, const char *newMessage)
Add the specified error to this container.
~SamValidationErrors()
Destructor.
static bool isValidQname(const char *qname, uint8_t qnameLen, SamValidationErrors &validationErrors)
Determines whether or not the specified qname is valid.
static bool isValidTags(SamRecord &samRecord, SamValidationErrors &validationErrors)
Validate the tags.
static bool isValidFlag(uint16_t flag, SamValidationErrors &validationErrors)
Determines whether or not the flag is valid.
static bool isValidQuality(SamRecord &samRecord, SamValidationErrors &validationErrors)
Validate the base quality.
static bool isValid(SamFileHeader &samHeader, SamRecord &samRecord, SamValidationErrors &validationErrors)
Validates whether or not the specified SamRecord is valid, calling all of the other validations.
static bool isValid1BasedPos(int32_t pos, SamValidationErrors &validationErrors)
Validate the refeference position.
static bool isValidRname(SamFileHeader &samHeader, const char *rname, SamValidationErrors &validationErrors)
Validate the reference name including validating against the header.
static bool isValidRefID(int32_t refID, const SamReferenceInfo &refInfo, SamValidationErrors &validationErrors)
Validate whether or not the specified reference id is valid.
static bool isValidCigar(SamRecord &samRecord, SamValidationErrors &validationErrors)
Validate the cigar.
static bool isValidMapQuality(uint8_t mapQuality, SamValidationErrors &validationErrors)
Validate the mapping quality.
static bool isValidSequence(SamRecord &samRecord, SamValidationErrors &validationErrors)
Validate the sequence, but not against the cigar or quality string.