libStatGen Software  1
FastQFile.cpp
1 /*
2  * Copyright (C) 2010 Regents of the University of Michigan
3  *
4  * This program is free software: you can redistribute it and/or modify
5  * it under the terms of the GNU General Public License as published by
6  * the Free Software Foundation, either version 3 of the License, or
7  * (at your option) any later version.
8  *
9  * This program is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program. If not, see <http://www.gnu.org/licenses/>.
16  */
17 
18 #include <iostream>
19 
20 #include "InputFile.h"
21 #include "FastQFile.h"
22 #include "BaseUtilities.h"
23 
24 // Constructor.
25 // minReadLength - The minimum length that a base sequence must be for
26 // it to be valid.
27 // numPrintableErrors - The maximum number of errors that should be reported
28 // in detail before suppressing the errors.
29 //
30 FastQFile::FastQFile(int minReadLength, int numPrintableErrors)
31  : myFile(NULL),
32  myBaseComposition(),
33  myQualPerCycle(),
34  myCountPerCycle(),
35  myCheckSeqID(true),
36  myInterleaved(false),
37  myPrevSeqID(""),
38  myMinReadLength(minReadLength),
39  myNumPrintableErrors(numPrintableErrors),
40  myMaxErrors(-1),
41  myDisableMessages(false),
42  myFileProblem(false)
43 {
44  // Reset the member data.
45  reset();
46 }
47 
48 
50 {
51  myDisableMessages = true;
52 }
53 
54 
56 {
57  myDisableMessages = false;
58 }
59 
60 
61 // Disable Unique Sequence ID checking.
62 // Unique Sequence ID checking is enabled by default.
64 {
65  myCheckSeqID = false;
66 }
67 
68 
69 // Enable Unique Sequence ID checking.
70 // Unique Sequence ID checking is enabled by default.
72 {
73  myCheckSeqID = true;
74 }
75 
76 
77 /// Interleaved.
79 {
80  myInterleaved = true;
81 }
82 
83 
84 // Set the number of errors after which to quit reading/validating a file.
85 void FastQFile::setMaxErrors(int maxErrors)
86 {
87  myMaxErrors = maxErrors;
88 }
89 
90 
91 // Open a FastQFile.
93  BaseAsciiMap::SPACE_TYPE spaceType)
94 {
95  // reset the member data.
96  reset();
97 
98  myBaseComposition.resetBaseMapType();
99  myBaseComposition.setBaseMapType(spaceType);
100  myQualPerCycle.clear();
101  myCountPerCycle.clear();
102 
104 
105  // Close the file if there is already one open - checked by close.
106  status = closeFile();
107  if(status == FastQStatus::FASTQ_SUCCESS)
108  {
109  // Successfully closed a previously opened file if there was one.
110 
111  // Open the file
112  myFile = ifopen(fileName, "rt");
113  myFileName = fileName;
114 
115  if(myFile == NULL)
116  {
117  // Failed to open the file.
119  }
120  }
121 
122  if(status != FastQStatus::FASTQ_SUCCESS)
123  {
124  // Failed to open the file.
125  std::string errorMessage = "ERROR: Failed to open file: ";
126  errorMessage += fileName;
127  logMessage(errorMessage.c_str());
128  }
129  return(status);
130 }
131 
132 
133 // Close a FastQFile.
135 {
136  int closeStatus = 0; // Success.
137 
138  // If a file has been opened, close it.
139  if(myFile != NULL)
140  {
141  // Close the file.
142  closeStatus = ifclose(myFile);
143  myFile = NULL;
144  }
145  if(closeStatus == 0)
146  {
147  // Success - either there wasn't a file to close or it was closed
148  // successfully.
150  }
151  else
152  {
153  std::string errorMessage = "Failed to close file: ";
154  errorMessage += myFileName.c_str();
155  logMessage(errorMessage.c_str());
157  }
158 }
159 
160 
161 // Check to see if the file is open.
163 {
164  // Check to see if the file is open.
165  if((myFile != NULL) && (myFile->isOpen()))
166  {
167  // File pointer exists and the file is open.
168  return true;
169  }
170 
171  // File is not open.
172  return false;
173 }
174 
175 
176 // Check to see if the file is at the end of the file.
178 {
179  // Check to see if the file is open.
180  if((myFile != NULL) && (ifeof(myFile)))
181  {
182  // At EOF.
183  return true;
184  }
185 
186  // Not at EOF.
187  return false;
188 }
189 
190 
191 // Returns whether or not to keep reading the file.
192 // Stop reading (false) if eof or there is a problem reading the file.
194 {
195  if(isEof() || myFileProblem)
196  {
197  return(false);
198  }
199  return(true);
200 }
201 
202 
203 // Validate the specified fastq file
205  bool printBaseComp,
206  BaseAsciiMap::SPACE_TYPE spaceType,
207  bool printQualAvg)
208 {
209  // Open the fastqfile.
210  if(openFile(filename, spaceType) != FastQStatus::FASTQ_SUCCESS)
211  {
212  // Failed to open the specified file.
214  }
215 
216  // Track the total number of sequences that were validated.
217  int numSequences = 0;
218 
219  // Keep reading the file until there are no more fastq sequences to process
220  // and not configured to quit after a certain number of errors or there
221  // has not yet been that many errors.
222  // Or exit if there is a problem reading the file.
224  while (keepReadingFile() &&
225  ((myMaxErrors == -1) || (myMaxErrors > myNumErrors)))
226  {
227  // Validate one sequence. This call will read all the lines for
228  // one sequence.
229  status = readFastQSequence();
230  if((status == FastQStatus::FASTQ_SUCCESS) || (status == FastQStatus::FASTQ_INVALID))
231  {
232  // Read a sequence and it is either valid or invalid, but
233  // either way, a sequence was read, so increment the sequence count.
234  ++numSequences;
235  }
236  else
237  {
238  // Other error, so break out of processing.
239  break;
240  }
241  }
242 
243  // Report Base Composition Statistics.
244  if(printBaseComp)
245  {
246  myBaseComposition.print();
247  }
248 
249  if(printQualAvg)
250  {
251  printAvgQual();
252  }
253 
254  std::string finishMessage = "Finished processing ";
255  finishMessage += myFileName.c_str();
256  char buffer[100];
257  if(sprintf(buffer,
258  " with %u lines containing %d sequences.",
259  myLineNum, numSequences) > 0)
260  {
261  finishMessage += buffer;
262  logMessage(finishMessage.c_str());
263  }
264  if(sprintf(buffer,
265  "There were a total of %d errors.",
266  myNumErrors) > 0)
267  {
268  logMessage(buffer);
269  }
270 
271  // Close the input file.
272  FastQStatus::Status closeStatus = closeFile();
273 
274  if((status != FastQStatus::FASTQ_SUCCESS) && (status != FastQStatus::FASTQ_INVALID) &&
276  {
277  // Stopped validating due to some error other than invalid, so
278  // return that error.
279  return(status);
280  }
281  else if(myNumErrors == 0)
282  {
283  // No errors, check to see if there were any sequences.
284  // Finished processing all of the sequences in the file.
285  // If there are no sequences, report an error.
286  if(numSequences == 0)
287  {
288  // Empty file, return error.
289  logMessage("ERROR: No FastQSequences in the file.");
291  }
293  }
294  else
295  {
296  // The file is invalid. But check the close status. If the close
297  // failed, it means there is a problem with the file itself not just
298  // with validation, so the close failure should be returned.
299  if(closeStatus != FastQStatus::FASTQ_SUCCESS)
300  {
301  return(closeStatus);
302  }
304  }
305 }
306 
307 
308 // Reads and validates a single fastq sequence from myFile.
310 {
311  // First verify that a file is open, if not, return failure.
312  if(!isOpen())
313  {
314  std::string message =
315  "ERROR: Trying to read a fastq file but no file is open.";
316  logMessage(message.c_str());
318  }
319 
320  // Reset variables for each sequence.
321  resetForEachSequence();
322 
323  bool valid = true;
324 
325  // No sequence was read.
326  if(isTimeToQuit())
327  {
329  }
330 
331  // The first line is the sequence identifier, so validate that.
332  valid = validateSequenceIdentifierLine();
333 
334  if(myFileProblem)
335  {
337  }
338 
339  // If we are at the end of the file, check to see if it is a partial
340  // sequence or just an empty line at the end.
341  if(ifeof(myFile))
342  {
343  // If the sequence identifier line was empty and we are at the
344  // end of the file, there is nothing more to validate.
345  if(mySequenceIdLine.Length() != 0)
346  {
347  // There was a sequence identifier line, so this is an incomplete
348  // sequence.
349  myErrorString = "Incomplete Sequence.\n";
350  reportErrorOnLine();
351 
352  valid = false;
353  }
354  if(valid)
355  {
356  // Return failure - no sequences were left to read. At the end
357  // of the file. It wasn't invalid and it wasn't really an error.
359  }
360  else
361  {
363  }
364  }
365 
366  // If enough errors, quit before reading any more.
367  if(isTimeToQuit())
368  {
369  // Means there was an error, so mark it as invalid.
371  }
372 
373  // Validate the Raw Sequence Line(s) and the "+" line.
374  valid &= validateRawSequenceAndPlusLines();
375 
376  if(myFileProblem)
377  {
379  }
380 
381  // If enough errors, quit before reading any more.
382  if(isTimeToQuit())
383  {
385  }
386 
387  // If it is the end of a file, it is missing the quality string.
388  if(ifeof(myFile))
389  {
390  // There was a sequence identifier line, so this is an incomplete
391  // sequence.
392  myErrorString = "Incomplete Sequence, missing Quality String.";
393  reportErrorOnLine();
394  valid = false;
396  }
397 
398  // All that is left is to validate the quality string line(s).
399  valid &= validateQualityStringLines();
400 
401  if(myFileProblem)
402  {
404  }
405 
406  if(valid)
407  {
409  }
411 }
412 
413 
414 // Reads and validates the sequence identifier line of a fastq sequence.
415 bool FastQFile::validateSequenceIdentifierLine()
416 {
417  // Read the first line of the sequence.
418  int readStatus = mySequenceIdLine.ReadLine(myFile);
419 
420  // Check to see if the read was successful.
421  if(readStatus <= 0)
422  {
423  // If EOF, not an error.
424  if(ifeof(myFile))
425  {
426  return true;
427  }
428  myFileProblem = true;
429  myErrorString = "Failure trying to read sequence identifier line";
430  reportErrorOnLine();
431  return false;
432  }
433 
434  // If the line is 0 length and it is the end of the file, just
435  // return since this is the eof - no error.
436  if((mySequenceIdLine.Length() == 0) && (ifeof(myFile)))
437  {
438  // Not an error, just a new line at the end of the file.
439  return true;
440  }
441 
442  // Increment the line number.
443  myLineNum++;
444 
445  // Verify that the line has at least 2 characters: '@' and at least
446  // one character for the sequence identifier.
447  if(mySequenceIdLine.Length() < 2)
448  {
449  // Error. Sequence Identifier line not long enough.
450  myErrorString = "The sequence identifier line was too short.";
451  reportErrorOnLine();
452  return false;
453  }
454 
455  // The sequence identifier line must start wtih a '@'
456  if(mySequenceIdLine[0] != '@')
457  {
458  // Error - sequence identifier line does not begin with an '@'.
459  myErrorString = "First line of a sequence does not begin with @";
460  reportErrorOnLine();
461  return false;
462  }
463 
464  // Valid Sequence Identifier Line.
465 
466  // The sequence identifier ends at the first space or at the end of the
467  // line if there is no space.
468  // Use fast find since this is a case insensitive search.
469  // Start at 1 since we know that 0 is '@'
470  int endSequenceIdentifier = mySequenceIdLine.FastFindChar(' ', 1);
471 
472  // Check if a " " was found.
473  if(endSequenceIdentifier == -1)
474  {
475  // Did not find a ' ', so the identifier is the rest of the line.
476  // It starts at 1 since @ is at offset 0.
477  mySequenceIdentifier = (mySequenceIdLine.SubStr(1)).c_str();
478  }
479  else
480  {
481  // Found a ' ', so the identifier ends just before that.
482  // The sequence identifier must be at least 1 character long,
483  // therefore the endSequenceIdentifier must be greater than 1.
484  if(endSequenceIdentifier <= 1)
485  {
486  myErrorString =
487  "No Sequence Identifier specified before the comment.";
488  reportErrorOnLine();
489  return false;
490  }
491 
492  mySequenceIdentifier =
493  (mySequenceIdLine.SubStr(1, endSequenceIdentifier - 1)).c_str();
494  }
495 
496  // If myInterleaved, validate matches the previous seqID.
497  if(myInterleaved && (myPrevSeqID != ""))
498  {
499  // Valid if the sequence identifiers are identical or if
500  // the only difference is a trailing 1 or 2.
501  if(myPrevSeqID.compare(mySequenceIdentifier) != 0)
502  {
503  // Compare all but the last characters, then check the last characters for 1 or 2.
504  if((myPrevSeqID.compare(0, myPrevSeqID.length()-1, mySequenceIdentifier.c_str(), mySequenceIdentifier.Length()-1) != 0) ||
505  (((myPrevSeqID[myPrevSeqID.length()-1] != '1') || (mySequenceIdentifier[mySequenceIdentifier.Length()-1] != '2')) &&
506  (myPrevSeqID[myPrevSeqID.length()-1] != mySequenceIdentifier[mySequenceIdentifier.Length()-1])))
507  {
508  myErrorString = "Interleaved: consecutive reads do not have matching sequence identifiers: ";
509  myErrorString += mySequenceIdentifier.c_str();
510  myErrorString += " and ";
511  myErrorString += myPrevSeqID.c_str();
512  reportErrorOnLine();
513  myPrevSeqID.clear();
514  return(false);
515  }
516  }
517  myPrevSeqID.clear();
518  }
519  else
520  {
521  if(myInterleaved)
522  {
523  myPrevSeqID = mySequenceIdentifier.c_str();
524  }
525 
526  // Check if sequence identifier should be validated for uniqueness if it is
527  // not the 2nd in an interleaved pair.
528  if(myCheckSeqID)
529  {
530  // Check to see if the sequenceIdentifier is a repeat by adding
531  // it to the set and seeing if it already existed.
532  std::pair<std::map<std::string, unsigned int>::iterator,bool> insertResult;
533  insertResult =
534  myIdentifierMap.insert(std::make_pair(mySequenceIdentifier.c_str(),
535  myLineNum));
536 
537  if(insertResult.second == false)
538  {
539  // Sequence Identifier is a repeat.
540  myErrorString = "Repeated Sequence Identifier: ";
541  myErrorString += mySequenceIdentifier.c_str();
542  myErrorString += " at Lines ";
543  myErrorString += insertResult.first->second;
544  myErrorString += " and ";
545  myErrorString += myLineNum;
546  reportErrorOnLine();
547  return(false);
548  }
549  }
550  }
551 
552  // Valid, return true.
553  return(true);
554 }
555 
556 
557 // Reads and validates the raw sequence line(s) and the plus line. Both are
558 // included in one method since it is unknown when the raw sequence line
559 // ends until you find the plus line that divides it from the quality
560 // string. Since this method will read the plus line to know when the
561 // raw sequence ends, it also validates that line.
562 bool FastQFile::validateRawSequenceAndPlusLines()
563 {
564  // Read the raw sequence.
565  int readStatus = myRawSequence.ReadLine(myFile);
566 
567  myLineNum++;
568 
569  if(readStatus <= 0)
570  {
571  myFileProblem = true;
572  myErrorString = "Failure trying to read sequence line";
573  reportErrorOnLine();
574  return false;
575  }
576 
577  // Offset into the raw sequence to be validated.
578  int offset = 0;
579 
580  // Validate the raw sequence.
581  bool valid = validateRawSequence(offset);
582 
583  // Increment the offset for what was just read.
584  offset = myRawSequence.Length();
585 
586  // The next line is either a continuation of the raw sequence or it starts
587  // with a '+'
588  // Keep validating til the '+' line or the end of file is found.
589  bool stillRawLine = true;
590  while(stillRawLine &&
591  !ifeof(myFile))
592  {
593  // If enough errors, quit before reading any more.
594  if(isTimeToQuit())
595  {
596  return(false);
597  }
598 
599  // Read the next line.
600  // Read into the plus line, but if it isn't a plus line, then
601  // it will be copied into the raw sequence line.
602  readStatus = myPlusLine.ReadLine(myFile);
603  myLineNum++;
604 
605  if(readStatus <= 0)
606  {
607  myFileProblem = true;
608  myErrorString = "Failure trying to read sequence/plus line";
609  reportErrorOnLine();
610  return false;
611  }
612 
613  // Check if the next line is blank
614  if(myPlusLine.Length() == 0)
615  {
616  // The next line is blank. Assume it is part of the raw sequence and
617  // report an error since there are no valid characters on the line.
618  myErrorString =
619  "Looking for continuation of Raw Sequence or '+' instead found a blank line, assuming it was part of Raw Sequence.";
620  reportErrorOnLine();
621  }
622  // Check for the plus line.
623  else if(myPlusLine[0] == '+')
624  {
625  // This is the + line.
626  valid &= validateSequencePlus();
627  stillRawLine = false;
628  }
629  else
630  {
631  // Not a plus line, so assume this is a continuation of the Raw
632  // Sequence.
633  // Copy from the plus line to the raw sequence line.
634  myRawSequence += myPlusLine;
635  myPlusLine.SetLength(0);
636  valid &= validateRawSequence(offset);
637 
638  // Increment the offset.
639  offset = myRawSequence.Length();
640  }
641  }
642 
643  // If enough errors, quit before reading any more.
644  if(isTimeToQuit())
645  {
646  return(false);
647  }
648 
649  // Now that the entire raw sequence has been obtained, check its length
650  // against the minimum allowed length.
651  if(myRawSequence.Length() < myMinReadLength)
652  {
653  // Raw sequence is not long enough - error.
654  myErrorString = "Raw Sequence is shorter than the min read length: ";
655  myErrorString += myRawSequence.Length();
656  myErrorString += " < ";
657  myErrorString += myMinReadLength;
658  reportErrorOnLine();
659  valid = false;
660  }
661 
662  // If enough errors, quit before reading any more.
663  if(isTimeToQuit())
664  {
665  return(false);
666  }
667 
668  // if the flag still indicates it is processing the raw sequence that means
669  // we reached the end of the file without a '+' line. So report that error.
670  if(stillRawLine)
671  {
672  myErrorString = "Reached the end of the file without a '+' line.";
673  reportErrorOnLine();
674  valid = false;
675  }
676 
677  return(valid);
678 }
679 
680 
681 // Reads and validates the quality string line(s).
682 bool FastQFile::validateQualityStringLines()
683 {
684  // Read the quality string.
685  int readStatus = myQualityString.ReadLine(myFile);
686  myLineNum++;
687 
688  if(readStatus <= 0)
689  {
690  myFileProblem = true;
691  myErrorString = "Failure trying to read quality line";
692  reportErrorOnLine();
693  return false;
694  }
695 
696  // track the offset into the quality string to validate.
697  int offset = 0;
698 
699  // Validate this line of the quality string.
700  bool valid = validateQualityString(offset);
701 
702  offset = myQualityString.Length();
703 
704  // Keep reading quality string lines until the length of the
705  // raw sequence has been hit or the end of the file is reached.
706  while((myQualityString.Length() < myRawSequence.Length()) &&
707  (!ifeof(myFile)))
708  {
709  // If enough errors, quit before reading any more.
710  if(isTimeToQuit())
711  {
712  return(false);
713  }
714 
715  // Read another line of the quality string.
716  readStatus = myTempPartialQuality.ReadLine(myFile);
717  myLineNum++;
718 
719  if(readStatus <= 0)
720  {
721  myFileProblem = true;
722  myErrorString = "Failure trying to read quality line";
723  reportErrorOnLine();
724  return false;
725  }
726 
727  myQualityString += myTempPartialQuality;
728  myTempPartialQuality.Clear();
729 
730  // Validate this line of the quality string.
731  valid &= validateQualityString(offset);
732  offset = myQualityString.Length();
733  }
734 
735  // If enough errors, quit before reading any more.
736  if(isTimeToQuit())
737  {
738  return(false);
739  }
740 
741  // Validate that the quality string length is the same as the
742  // raw sequence length.
743  if(myQualityString.Length() != myRawSequence.Length())
744  {
745  myErrorString = "Quality string length (";
746  myErrorString += myQualityString.Length();
747  myErrorString += ") does not equal raw sequence length (";
748  myErrorString += myRawSequence.Length();
749  myErrorString += ")";
750  reportErrorOnLine();
751  valid = false;
752  }
753  return(valid);
754 }
755 
756 
757 // Method to validate a line that contains part of the raw sequence.
758 bool FastQFile::validateRawSequence(int offset)
759 {
760  bool validBase = false;
761  bool valid = true;
762 
763  // Loop through validating each character is valid for the raw sequence.
764  for(int sequenceIndex = offset; sequenceIndex < myRawSequence.Length();
765  sequenceIndex++)
766  {
767  // Update the composition for this position. Returns false if the
768  // character was not a valid base.
769  validBase =
770  myBaseComposition.updateComposition(sequenceIndex,
771  myRawSequence[sequenceIndex]);
772  // Check the return
773  if(!validBase)
774  {
775  // Error, found a value that is not a valid base character.
776  myErrorString = "Invalid character ('";
777  myErrorString += myRawSequence[sequenceIndex];
778  myErrorString += "') in base sequence.";
779  reportErrorOnLine();
780  valid = false;
781  // If enough errors, quit before reading any more.
782  if(isTimeToQuit())
783  {
784  return(false);
785  }
786  }
787  }
788  return(valid);
789 }
790 
791 
792 // Method to validate the "+" line that seperates the raw sequence and the
793 // quality string.
794 bool FastQFile::validateSequencePlus()
795 {
796  // Validate that optional sequence identifier is the same
797  // as the one on the @ line.
798 
799  // Check to see if there is more to the line than just the plus
800  int lineLength = myPlusLine.Length();
801 
802  // If the line is only 1 character or the second character is a space,
803  // then there is no sequence identifier on this line and there is nothing
804  // further to validate.
805  if((lineLength == 1) || (myPlusLine[1] == ' '))
806  {
807  // No sequence identifier, so just return valid.
808  return true;
809  }
810 
811  // There is a sequence identifier on this line, so validate that
812  // it matches the one from the associated @ line.
813  // The read in line must be at least 1 more character ('+') than the
814  // sequence identifier read from the '@' line.
815  // If it is not longer than the sequence identifier, then we know that it
816  // cannot be the same.
817  int sequenceIdentifierLength = mySequenceIdentifier.Length();
818  if(lineLength <= sequenceIdentifierLength)
819  {
820  myErrorString =
821  "Sequence Identifier on '+' line does not equal the one on the '@' line.";
822  reportErrorOnLine();
823  return false;
824  }
825 
826  bool same = true;
827  int seqIndex = 0;
828  int lineIndex = 1; // Start at 1 since offset 0 has '+'
829 
830  // Loop through the sequence index and the line buffer verifying they
831  // are the same until a difference is found or the end of the sequence
832  // identifier is found.
833  while((same == true) && (seqIndex < sequenceIdentifierLength))
834  {
835  if(myPlusLine[lineIndex] != mySequenceIdentifier[seqIndex])
836  {
837  myErrorString =
838  "Sequence Identifier on '+' line does not equal the one on the '@' line.";
839  reportErrorOnLine();
840  same = false;
841  }
842  lineIndex++;
843  seqIndex++;
844  }
845  return(same);
846 }
847 
848 
849 // Method to validate the quality string.
850 bool FastQFile::validateQualityString(int offset)
851 {
852  bool valid = true;
853  if(myQualityString.Length() > (int)(myQualPerCycle.size()))
854  {
855  myQualPerCycle.resize(myQualityString.Length());
856  myCountPerCycle.resize(myQualityString.Length());
857  }
858  // For each character in the line, verify that it is ascii > 32.
859  for(int i=offset; i < myQualityString.Length(); i++)
860  {
861  if(myQualityString[i] <= 32)
862  {
863  myErrorString = "Invalid character ('";
864  myErrorString += myQualityString[i];
865  myErrorString += "') in quality string.";
866  reportErrorOnLine();
867  valid = false;
868  // If enough errors, quit before reading any more.
869  if(isTimeToQuit())
870  {
871  return(false);
872  }
873  }
874  else
875  {
876  myQualPerCycle[i] += BaseUtilities::getPhredBaseQuality(myQualityString[i]);
877  myCountPerCycle[i] += 1;
878  }
879  }
880  return(valid);
881 }
882 
883 
884 // Helper method for printing the contents of myErrorString. It will
885 // only print the errors until the maximum number of reportable errors is
886 // reached.
887 void FastQFile::reportErrorOnLine()
888 {
889  // Increment the total number of errors.
890  myNumErrors++;
891 
892  // Only display the first X number of errors.
893  if(myNumErrors <= myNumPrintableErrors)
894  {
895  // Write the error with the line number.
896  char buffer[100];
897  sprintf(buffer, "ERROR on Line %u: ", myLineNum);
898  std::string message = buffer;
899  message += myErrorString.c_str();
900  logMessage(message.c_str());
901  }
902 }
903 
904 
905 // Reset member data that is unique for each fastQFile.
906 void FastQFile::reset()
907 {
908  // Each fastq file processing needs to also reset the member data for each
909  // sequence.
910  resetForEachSequence();
911  myNumErrors = 0; // per fastqfile
912  myLineNum = 0; // per fastqfile
913  myFileName.SetLength(0); // reset the filename string.
914  myIdentifierMap.clear(); // per fastqfile
915  myBaseComposition.clear(); // clear the base composition.
916  myQualPerCycle.clear();
917  myCountPerCycle.clear();
918  myFileProblem = false;
919 }
920 
921 
922 // Reset the member data that is unique for each sequence.
923 void FastQFile::resetForEachSequence()
924 {
925  myLineBuffer.SetLength(0);
926  myErrorString.SetLength(0);
927  myRawSequence.SetLength(0);
928  mySequenceIdLine.SetLength(0);
929  mySequenceIdentifier.SetLength(0);
930  myPlusLine.SetLength(0);
931  myQualityString.SetLength(0);
932  myTempPartialQuality.SetLength(0);
933 }
934 
935 
936 void FastQFile::logMessage(const char* logMessage)
937 {
938  // Write the message if they are not disabled.
939  if(!myDisableMessages)
940  {
941  std::cout << logMessage << std::endl;
942  }
943 }
944 
945 
946 // Determine if it is time to quit by checking if we are to quit after a
947 // certain number of errors and that many errors have been encountered.
948 bool FastQFile::isTimeToQuit()
949 {
950  // It is time to quit if we are to quit after a certain number of errors
951  // and that many errors have been encountered.
952  if((myMaxErrors != -1) && (myNumErrors >= myMaxErrors))
953  {
954  return(true);
955  }
956  return(false);
957 }
958 
959 
960 void FastQFile::printAvgQual()
961 {
962  std::cout << std::endl << "Average Phred Quality by Read Index (starts at 0):" << std::endl;
963  std::cout.precision(2);
964  std::cout << std::fixed << "Read Index\tAverage Quality"
965  << std::endl;
966  if(myQualPerCycle.size() != myCountPerCycle.size())
967  {
968  // This is a code error and should NEVER happen.
969  std::cerr << "ERROR calculating the average Qualities per cycle\n";
970  }
971 
972  double sumQual = 0;
973  double count = 0;
974  double avgQual = 0;
975  for(unsigned int i = 0; i < myQualPerCycle.size(); i++)
976  {
977  avgQual = 0;
978  if(myCountPerCycle[i] != 0)
979  {
980  avgQual = myQualPerCycle[i] / (double)(myCountPerCycle[i]);
981  }
982  std::cout << i << "\t" << avgQual << "\n";
983  sumQual += myQualPerCycle[i];
984  count += myCountPerCycle[i];
985  }
986  std::cout << std::endl;
987  avgQual = 0;
988  if(count != 0)
989  {
990  avgQual = sumQual / count;
991  }
992  std::cout << "Overall Average Phred Quality = " << avgQual << std::endl;
993 }
int ifeof(IFILE file)
Check to see if we have reached the EOF (returns 0 if not EOF).
Definition: InputFile.h:654
IFILE ifopen(const char *filename, const char *mode, InputFile::ifileCompression compressionMode=InputFile::DEFAULT)
Open a file with the specified name and mode, using a filename of "-" to indicate stdin/stdout.
Definition: InputFile.h:562
int ifclose(IFILE &file)
Close the file.
Definition: InputFile.h:580
SPACE_TYPE
The type of space (color or base) to use in the mapping.
Definition: BaseAsciiMap.h:44
bool updateComposition(unsigned int rawSequenceCharIndex, char baseChar)
Update the composition for the specified index with the specified character.
void clear()
Clear the composition stored in the base count vector.
void setBaseMapType(BaseAsciiMap::SPACE_TYPE spaceType)
Set the base map type for this composition.
void print()
Print the composition.
void resetBaseMapType()
Reset the base map type for this composition.
static uint8_t getPhredBaseQuality(char charQuality)
Get phred base quality from the specified ascii quality.
void interleaved()
Interleaved.
Definition: FastQFile.cpp:78
FastQStatus::Status openFile(const char *fileName, BaseAsciiMap::SPACE_TYPE spaceType=BaseAsciiMap::UNKNOWN)
Open a FastQFile.
Definition: FastQFile.cpp:92
void enableSeqIDCheck()
Enable Unique Sequence ID checking.
Definition: FastQFile.cpp:71
void disableMessages()
Disable messages - do not write to cout.
Definition: FastQFile.cpp:49
bool isOpen()
Check to see if the file is open.
Definition: FastQFile.cpp:162
void disableSeqIDCheck()
Disable Unique Sequence ID checking (Unique Sequence ID checking is enabled by default).
Definition: FastQFile.cpp:63
FastQStatus::Status readFastQSequence()
Read 1 FastQSequence, validating it.
Definition: FastQFile.cpp:309
FastQStatus::Status closeFile()
Close a FastQFile.
Definition: FastQFile.cpp:134
FastQStatus::Status validateFastQFile(const String &filename, bool printBaseComp, BaseAsciiMap::SPACE_TYPE spaceType, bool printQualAvg=false)
Validate the specified fastq file.
Definition: FastQFile.cpp:204
bool keepReadingFile()
Returns whether or not to keep reading the file, it stops reading (false) if eof or there is a proble...
Definition: FastQFile.cpp:193
void enableMessages()
Enable messages - write to cout.
Definition: FastQFile.cpp:55
void setMaxErrors(int maxErrors)
Set the number of errors after which to quit reading/validating a file, defaults to -1.
Definition: FastQFile.cpp:85
FastQFile(int minReadLength=10, int numPrintableErrors=20)
Constructor.
Definition: FastQFile.cpp:30
bool isEof()
Check to see if the file is at the end of the file.
Definition: FastQFile.cpp:177
Status
Return value enum for the FastQFile class methods, indicating success or error codes.
Definition: FastQStatus.h:31
@ FASTQ_ORDER_ERROR
means the methods are called out of order, like trying to read a file before opening it.
Definition: FastQStatus.h:34
@ FASTQ_READ_ERROR
means that a problem occurred on a read.
Definition: FastQStatus.h:37
@ FASTQ_SUCCESS
indicates method finished successfully.
Definition: FastQStatus.h:32
@ FASTQ_INVALID
means that the sequence was invalid.
Definition: FastQStatus.h:33
@ FASTQ_OPEN_ERROR
means the file could not be opened.
Definition: FastQStatus.h:35
@ FASTQ_NO_SEQUENCE_ERROR
means there were no errors, but no sequences read.
Definition: FastQStatus.h:38
@ FASTQ_CLOSE_ERROR
means the file could not be closed.
Definition: FastQStatus.h:36
bool isOpen() const
Returns whether or not the file was successfully opened.
Definition: InputFile.h:423