25 #include "GenomeSequence.h"
56 void PackedRead::set(
const char *rhs,
int padWithNCount)
61 while (padWithNCount>1)
63 packedBases.push_back(
77 packedBases.push_back(
92 assert(padWithNCount==0);
96 while (*rhs && *(rhs+1))
98 packedBases.push_back(
110 packedBases.push_back(
118 std::string GenomeSequence::IntegerToSeq(
unsigned int n,
unsigned int wordsize)
const
120 std::string sequence(
"");
121 for (
unsigned int i = 0; i < wordsize; i ++)
124 unsigned int clearHigherBits = ~(3U << (wordsize<<1));
126 if (n > clearHigherBits)
127 error(
"%d needs to be a non-negative integer < clearHigherBits\n", n);
129 for (
unsigned int i = 0; i < wordsize; i++)
144 void GenomeSequence::constructorClear()
147 _progressStream = NULL;
149 _createOverwrite =
false;
152 void GenomeSequence::setup(
const char *referenceFilename)
156 if (_progressStream) *_progressStream <<
"open and prefetch reference genome " << referenceFilename <<
": " << std::flush;
160 std::cerr <<
"Failed to open reference genome " << referenceFilename << std::endl;
161 std::cerr << errorStr << std::endl;
166 if (_progressStream) *_progressStream <<
"done." << std::endl << std::flush;
188 _umfaFilename = _baseFilename +
"-cs.umfa";
192 _umfaFilename = _baseFilename +
"-bs.umfa";
195 if(access(_umfaFilename.c_str(), R_OK) != 0)
201 std::cerr <<
"GenomeSequence::open: failed to open file "
203 <<
" also failed creating it."
212 std::cerr <<
"GenomeSequence::open: failed to open file "
218 _colorSpace = header->_colorSpace;
223 void GenomeSequence::sanityCheck(
MemoryMap &fasta)
const
227 unsigned int genomeIndex = 0;
228 for (i=0; i<fasta.length(); i++)
233 while (fasta[i]!=
'\n' && fasta[i]!=
'\r') i++;
247 #define HAS_SUFFIX(str, suffix) ((strlen(suffix) < str.size()) && (str.substr(str.size() - strlen(suffix)) == suffix))
257 if (HAS_SUFFIX(referenceFilename,
".fa"))
259 _referenceFilename = referenceFilename;
260 _baseFilename = _referenceFilename.substr(0, referenceFilename.size() - 3);
262 else if (HAS_SUFFIX(referenceFilename,
".umfa"))
264 _baseFilename = referenceFilename.substr(0, referenceFilename.size() - 5);
266 else if (HAS_SUFFIX(referenceFilename,
"-cs.umfa"))
268 _baseFilename = referenceFilename.substr(0, referenceFilename.size() - 8);
270 else if (HAS_SUFFIX(referenceFilename,
"-bs.umfa"))
272 _baseFilename = referenceFilename.substr(0, referenceFilename.size() - 8);
276 _baseFilename = referenceFilename;
278 _fastaFilename = _baseFilename +
".fa";
280 if (HAS_SUFFIX(referenceFilename,
".fasta"))
282 _referenceFilename = referenceFilename;
283 _baseFilename = _referenceFilename.substr(0, referenceFilename.size() - 6);
284 _fastaFilename = _baseFilename +
".fasta";
299 bool GenomeSequence::setChromosomeMD5andLength(uint32_t whichChromosome)
301 if (whichChromosome>=header->_chromosomeCount)
return true;
304 c->size = header->elementCount - c->start;
307 uint8_t md5Signature[MD5_DIGEST_LENGTH];
313 char *md5Buffer = (
char *) malloc(c->size);
315 MD5Init(&md5Context);
317 for (genomeIndex_t i = 0; i < c->size; i ++)
319 md5Buffer[i] = (*this)[c->start + i];
321 MD5Update(&md5Context, (
unsigned char *) md5Buffer, c->size);
322 MD5Final((
unsigned char *) &md5Signature, &md5Context);
324 for (
int i=0; i<MD5_DIGEST_LENGTH; i++)
327 sprintf(c->md5+2*i,
"%02x", md5Signature[i]);
331 c->md5[2*MD5_DIGEST_LENGTH] =
'\0';
340 static bool getFastaStats(
const char *fastaData,
size_t fastaDataSize, uint32_t &chromosomeCount, uint64_t &baseCount)
344 bool atLineStart =
true;
350 for (
size_t fastaIndex = 0; fastaIndex < fastaDataSize; fastaIndex++)
352 switch (fastaData[fastaIndex])
360 if (!atLineStart)
break;
365 while (fastaIndex < fastaDataSize && fastaData[fastaIndex]!=
'\n' && fastaData[fastaIndex]!=
'\r')
383 std::vector<uint8_t> m_packedBases;
387 void set(
size_t index, uint8_t value) {
388 m_packedBases[index>>1] =
389 (m_packedBases[index>>1]
390 & ~(7<<((index&0x01)<<2)))
391 | ((value&0x0f)<<((index&0x1)<<2));
396 void reserve(
size_t baseCount) {m_packedBases.reserve(baseCount/2);}
397 size_t size() {
return m_baseCount;}
398 void clear() {m_packedBases.clear(); m_baseCount = 0;}
399 uint8_t operator [](
size_t index)
401 return (m_packedBases[index>>1] >> ((index&0x1)<<2)) & 0xf;
403 void push_back(uint8_t base);
420 bool loadFastaFile(
const char *filename,
421 std::vector<PackedSequenceData> &sequenceData,
422 std::vector<std::string> &chromosomeNames)
426 if(!inputStream.isOpen()) {
427 std::cerr <<
"Failed to open file " << filename <<
"\n";
431 int whichChromosome = -1;
432 chromosomeNames.clear();
435 while((ch = inputStream.ifgetc()) != EOF) {
443 std::string chromosomeName =
"";
447 while (!isspace((ch = inputStream.ifgetc())) && ch != EOF)
449 chromosomeName += ch;
455 ch = inputStream.ifgetc();
456 }
while(ch != EOF && ch !=
'\n' && ch !=
'\r');
461 chromosomeNames.push_back(chromosomeName);
488 bool GenomeSequence::create(
bool isColor)
490 setColorSpace(isColor);
492 if (_baseFilename==
"")
494 std::cerr <<
"Base reference filename is empty." << std::endl;
500 _umfaFilename = _baseFilename +
"-cs.umfa";
504 _umfaFilename = _baseFilename +
"-bs.umfa";
507 if (!_createOverwrite && access(_umfaFilename.c_str(), R_OK) == 0)
509 std::cerr <<
"Output file '" << _umfaFilename <<
"' exists or is not writable - please remove." << std::endl;
515 if (fastaFile.
open(_fastaFilename.c_str()))
517 std::cerr <<
"failed to open input fasta file '" << _fastaFilename <<
"'." << std::endl;
521 std::cerr <<
"Creating FASTA "
523 <<
"binary cache file '"
528 std::cerr << std::flush;
534 const char *fasta = (
const char *) fastaFile.data;
535 size_t fastaDataSize = fastaFile.length();
537 uint32_t chromosomeCount = 0;
538 uint64_t baseCount = 0;
539 getFastaStats(fasta, fastaDataSize, chromosomeCount, baseCount);
543 std::cerr <<
"failed to create '"
550 header->elementCount = 0;
552 header->setApplication(_application.c_str());
553 header->_chromosomeCount = chromosomeCount;
557 for (uint32_t i=0; i<header->_chromosomeCount; i++) header->_chromosomes[i].constructorClear();
559 std::string chromosomeName;
564 bool terminateLoad =
false;
565 int percent = -1, newPercent;
566 uint32_t whichChromosome = 0;
567 for (uint64_t fastaIndex = 0; fastaIndex < fastaDataSize; fastaIndex++)
571 newPercent = (int) (1.0 * fastaIndex / fastaDataSize) * 100;
572 if (newPercent>percent)
574 *_progressStream <<
"\r" << newPercent <<
"% ";
575 *_progressStream << std::flush;
576 percent = newPercent;
579 switch (fasta[fastaIndex])
591 while (!isspace(fasta[fastaIndex]))
593 chromosomeName += fasta[fastaIndex++];
598 while (fasta[fastaIndex]!=
'\n' && fasta[fastaIndex]!=
'\r')
607 c->setChromosomeName(chromosomeName.c_str());
608 c->start = header->elementCount;
611 if (whichChromosome>0)
619 setChromosomeMD5andLength(whichChromosome - 1);
622 if (whichChromosome > header->_chromosomeCount)
624 std::cerr <<
"BUG: Exceeded computed chromosome count ("
625 << header->_chromosomeCount
626 <<
") - genome is now truncated at chromosome "
627 << header->_chromosomes[header->_chromosomeCount-1].name
629 << header->_chromosomeCount
632 terminateLoad =
true;
647 const char fromBase2CS[] =
679 if (lastBase>3 || thisBase>3) color=4;
680 else color = fromBase2CS[(int)(lastBase<<2 | thisBase)];
683 set(header->elementCount++,
690 set(header->elementCount++, toupper(fasta[fastaIndex]));
699 if (terminateLoad)
break;
706 if (whichChromosome==0)
709 throw std::runtime_error(
"No chromosomes found - aborting!");
713 setChromosomeMD5andLength(whichChromosome-1);
718 if (_progressStream) *_progressStream <<
"\r";
720 std::cerr <<
"FASTA binary cache file '"
733 return header->_chromosomeCount;
739 if (position == INVALID_GENOME_INDEX)
return INVALID_CHROMOSOME_INDEX;
741 if (header->_chromosomeCount == 0)
742 return INVALID_CHROMOSOME_INDEX;
745 int stop = header->_chromosomeCount - 1;
750 if (position > header->_chromosomes[stop].start)
753 while (start <= stop)
755 int middle = (start + stop) / 2;
757 if (position >= header->_chromosomes[middle].start && position < header->_chromosomes[middle + 1].start)
760 if (position == header->_chromosomes[middle + 1].start)
763 if (position > header->_chromosomes[middle + 1].start)
766 if (position < header->_chromosomes[middle].start)
780 const char *chromosomeName,
781 unsigned int chromosomeIndex)
const
784 if (i == INVALID_GENOME_INDEX)
return INVALID_GENOME_INDEX;
785 return i + chromosomeIndex - 1;
790 unsigned int chromosomeIndex)
const
792 if (chromosome<0 || chromosome >= (
int) header->_chromosomeCount)
return INVALID_GENOME_INDEX;
794 genomeIndex_t i = header->_chromosomes[chromosome].start;
795 if (i == INVALID_GENOME_INDEX)
return INVALID_GENOME_INDEX;
796 return i + chromosomeIndex - 1;
810 if (chromosome==INVALID_CHROMOSOME_INDEX)
return INVALID_GENOME_INDEX;
811 return header->_chromosomes[chromosome].start;
817 for (i=0; i<header->_chromosomeCount; i++)
819 if (strcmp(header->_chromosomes[i].name, chromosomeName)==0)
824 return INVALID_CHROMOSOME_INDEX;
831 void GenomeSequence::getReverseRead(std::string &read)
834 if (read.size())
for (int32_t i=(
int) read.size() - 1; i>=0; i--)
836 newRead.push_back(BasePair(read[i]));
841 void GenomeSequence::getReverseRead(
String& read)
844 int j = read.Length()-1;
854 #define ABS(x) ( (x) > 0 ? (x) : -(x) )
855 int GenomeSequence::debugPrintReadValidation(
857 std::string &quality,
859 genomeIndex_t readLocation,
865 int validateSumQ = 0;
866 int validateMismatchCount = 0;
868 std::string genomeData;
870 for (uint32_t i=0; i<read.size(); i++)
872 if (tolower(read[i]) != tolower((*
this)[readLocation + i]))
874 validateSumQ += quality[i] -
'!';
876 if (direction==
'F' ? i<24 : (i >= (read.size() - 24))) validateMismatchCount++;
877 genomeData.push_back(tolower((*
this)[readLocation + i]));
881 genomeData.push_back(toupper((*
this)[readLocation + i]));
884 assert(validateSumQ>=0);
885 if (validateSumQ != sumQuality && validateMismatchCount == mismatchCount)
887 printf(
"SUMQ: Original Genome: %s test read: %s : actual sumQ = %d, test sumQ = %d\n",
895 else if (validateSumQ == sumQuality && validateMismatchCount != mismatchCount)
897 printf(
"MISM: Original Genome: %s test read: %s : actual mismatch %d test mismatches %d\n",
900 validateMismatchCount,
905 else if (validateSumQ != sumQuality && validateMismatchCount != mismatchCount)
907 printf(
"BOTH: Original Genome: %s test read: %s : actual sumQ = %d, test sumQ = %d, actual mismatch %d test mismatches %d\n",
912 validateMismatchCount,
918 if (recurse && ABS(validateMismatchCount - mismatchCount) > (
int) read.size()/2)
920 printf(
"large mismatch difference, trying reverse strand: ");
921 std::string reverseRead = read;
922 std::string reverseQuality = quality;
923 getReverseRead(reverseRead);
924 reverse(reverseQuality.begin(), reverseQuality.end());
925 rc = debugPrintReadValidation(reverseRead, reverseQuality, readLocation, sumQuality, mismatchCount,
false);
932 bool GenomeSequence::wordMatch(
unsigned int index, std::string &word)
const
934 for (uint32_t i = 0; i<word.size(); i++)
936 if ((*
this)[index + i] != word[i])
return false;
941 bool GenomeSequence::printNearbyWords(
unsigned int index,
unsigned int deviation, std::string &word)
const
943 for (
unsigned int i = index - deviation; i < index + deviation; i++)
945 if (wordMatch(i, word))
947 std::cerr <<
"word '"
951 <<
" away from position "
960 void GenomeSequence::dumpSequenceSAMDictionary(std::ostream &file)
const
962 for (
unsigned int i=0; i<header->_chromosomeCount; i++)
966 <<
"\tSN:" << header->_chromosomes[i].name
967 <<
"\tLN:" << header->_chromosomes[i].size
968 <<
"\tAS:" << header->_chromosomes[i].assemblyID
969 <<
"\tM5:" << header->_chromosomes[i].md5
970 <<
"\tUR:" << header->_chromosomes[i].uri
971 <<
"\tSP:" << header->_chromosomes[i].species
976 void GenomeSequence::dumpHeaderTSV(std::ostream &file)
const
978 file <<
"# Reference: " << _baseFilename << std::endl;
979 file <<
"# SN: sample name - must be unique" << std::endl;
980 file <<
"# AS: assembly name" << std::endl;
981 file <<
"# SP: species" << std::endl;
982 file <<
"# LN: chromosome/contig length" << std::endl;
983 file <<
"# M5: chromosome/contig MD5 checksum" << std::endl;
984 file <<
"# LN and M5 are only printed for informational purposes." << std::endl;
985 file <<
"# Karma will only set those values when creating the index." << std::endl;
986 file <<
"SN" <<
"\t" <<
"AS" <<
"\t" <<
"SP" <<
"\t" <<
"UR" <<
"\t" <<
"LN" <<
"\t" <<
"M5" << std::endl;
987 for (
unsigned int i=0; i<header->_chromosomeCount; i++)
990 << header->_chromosomes[i].name
991 <<
"\t" << header->_chromosomes[i].assemblyID
992 <<
"\t" << header->_chromosomes[i].uri
993 <<
"\t" << header->_chromosomes[i].species
994 <<
"\t" << header->_chromosomes[i].size
995 <<
"\t" << header->_chromosomes[i].md5
1001 void GenomeSequence::getString(std::string &str,
int chromosome, uint32_t index,
int baseCount)
const
1006 genomeIndex_t genomeIndex = header->_chromosomes[chromosome].start + index - 1;
1008 getString(str, genomeIndex, baseCount);
1011 void GenomeSequence::getString(
String &str,
int chromosome, uint32_t index,
int baseCount)
const
1014 this-> getString(
string, chromosome, index, baseCount);
1015 str =
string.c_str();
1018 void GenomeSequence::getString(std::string &str, genomeIndex_t index,
int baseCount)
const
1023 for (
int i=0; i<baseCount; i++)
1025 str.push_back((*
this)[index + i]);
1032 for (
int i=0; i< -baseCount; i++)
1039 void GenomeSequence::getString(
String &str, genomeIndex_t index,
int baseCount)
const
1042 getString(
string, index, baseCount);
1043 str =
string.c_str();
1046 void GenomeSequence::getHighLightedString(std::string &str, genomeIndex_t index,
int baseCount, genomeIndex_t highLightStart, genomeIndex_t highLightEnd)
const
1051 for (
int i=0; i<baseCount; i++)
1053 char base = (*this)[index + i];
1054 if (in(index+i, highLightStart, highLightEnd))
1055 base = tolower(base);
1056 str.push_back(base);
1063 for (
int i=0; i< -baseCount; i++)
1066 if (in(index+i, highLightStart, highLightEnd))
1067 base = tolower(base);
1068 str.push_back(base);
1073 void GenomeSequence::print30(genomeIndex_t index)
const
1075 std::cout <<
"index: " << index <<
"\n";
1076 for (genomeIndex_t i=index-30; i<index+30; i++)
1077 std::cout << (*
this)[i];
1079 for (genomeIndex_t i=index-30; i<index; i++)
1082 std::cout << std::endl;
1085 void GenomeSequence::getMismatchHatString(std::string &result,
const std::string &read, genomeIndex_t location)
const
1088 for (uint32_t i=0; i < read.size(); i++)
1090 if (read[i] == (*
this)[location+i])
1091 result.push_back(
' ');
1093 result.push_back(
'^');
1097 void GenomeSequence::getMismatchString(std::string &result,
const std::string &read, genomeIndex_t location)
const
1100 for (uint32_t i=0; i < read.size(); i++)
1102 if (read[i] == (*
this)[location+i])
1103 result.push_back(toupper(read[i]));
1105 result.push_back(tolower(read[i]));
1109 genomeIndex_t GenomeSequence::simpleLocalAligner(std::string &read, std::string &quality, genomeIndex_t index,
int windowSize)
const
1111 int bestScore = 1000000;
1112 genomeIndex_t bestMatchLocation = INVALID_GENOME_INDEX;
1113 for (
int i=-windowSize; i<windowSize; i++)
1117 if (i<0 && ((uint32_t) -i) > index)
continue;
1126 newScore = this->
getSumQ(read, quality, index + i);
1128 if (newScore < bestScore)
1130 bestScore = newScore;
1131 bestMatchLocation = index + i;
1134 return bestMatchLocation;
1140 stream <<
"chromosomeCount: " << h._chromosomeCount << std::endl;
1141 stream <<
"isColorSpace: " << h._colorSpace << std::endl;
1142 stream <<
"chromosomeCount: " << h._chromosomeCount << std::endl;
1143 uint64_t totalSize = 0;
1144 for (uint32_t i=0; i < h._chromosomeCount; i++)
1146 totalSize += h._chromosomes[i].size;
1147 stream <<
"Chromosome Index " << i <<
" name: " << h._chromosomes[i].name << std::endl;
1148 stream <<
"Chromosome Index " << i <<
" whole genome start: " << h._chromosomes[i].start << std::endl;
1149 stream <<
"Chromosome Index " << i <<
" whole genome size: " << h._chromosomes[i].size << std::endl;
1150 stream <<
"Chromosome Index " << i <<
" md5 checksum: " << h._chromosomes[i].md5 << std::endl;
1151 stream <<
"Chromosome Index " << i <<
" assemblyID: " << h._chromosomes[i].assemblyID << std::endl;
1152 stream <<
"Chromosome Index " << i <<
" species: " << h._chromosomes[i].species << std::endl;
1153 stream <<
"Chromosome Index " << i <<
" URI: " << h._chromosomes[i].uri << std::endl;
1155 stream <<
"Total Genome Size: " << totalSize <<
" bases."<< std::endl;
1156 if (totalSize != h.elementCount)
1158 stream <<
"Total Genome Size: does not match elementCount!\n";
1161 stream << std::endl;
1165 void GenomeSequence::getChromosomeAndIndex(std::string &s, genomeIndex_t i)
const
1167 int whichChromosome = 0;
1171 if (whichChromosome == INVALID_CHROMOSOME_INDEX)
1173 s =
"invalid genome index";
1177 std::ostringstream buf;
1179 buf << header->_chromosomes[whichChromosome].name <<
":" << chromosomeIndex;
1181 buf <<
" (GenomeIndex " << i <<
")";
1189 void GenomeSequence::getChromosomeAndIndex(
String& s, genomeIndex_t i)
const
1192 getChromosomeAndIndex(ss, i);
1213 bool GenomeSequence::populateDBSNP(
1215 IFILE inputFile)
const
1219 if(inputFile == NULL)
1225 std::string chromosomeName;
1226 std::string position;
1227 genomeIndex_t chromosomePosition1;
1228 uint64_t ignoredLineCount = 0;
1231 char* postPosPtr = NULL;
1232 while(!inputFile->
ifeof())
1234 chromosomeName.clear();
1237 if(inputFile->
readTilTab(chromosomeName) <= 0)
1241 if(chromosomeName.size()>0 && chromosomeName[0]==
'#')
1253 if(chromosomeName.size()>0 && chromosomeName[0]==
'#')
1269 chromosomePosition1 = strtoul(position.c_str(), &postPosPtr, 0);
1272 if(postPosPtr == position.c_str())
1279 genomeIndex_t genomeIndex =
1283 if((genomeIndex == INVALID_GENOME_INDEX) ||
1290 dbSNP.set(genomeIndex,
true);
1293 if (ignoredLineCount > 0)
1295 std::cerr <<
"GenomeSequence::populateDBSNP: ignored " << ignoredLineCount <<
" SNP positions due to invalid format of line." << std::endl;
1303 const char *inputFileName)
const
1319 if (strlen(inputFileName)!=0)
1321 std::cerr <<
"Load dbSNP file '" << inputFileName <<
"': " << std::flush;
1323 if (dbSNP.
open(inputFileName, O_RDONLY))
1331 if (dbSNP.getErrorString().find(
"wrong type of file")==std::string::npos)
1333 std::cerr <<
"Error: " << dbSNP.getErrorString() << std::endl;
1340 if(inputFile == NULL)
1342 std::cerr <<
"Error: failed to open " << inputFileName << std::endl;
1346 std::cerr <<
"(as text file) ";
1352 populateDBSNP(dbSNP, inputFile);
1358 std::cerr <<
"(as binary mapped file) ";
1361 std::cerr <<
"DONE!" << std::endl;
1373 void simplestExample(
void)
1376 genomeIndex_t index;
1384 if (reference.
open())
1386 perror(
"GenomeSequence::open");
1401 std::cout <<
"base[" << index <<
"] = " << reference[index] << std::endl;
1410 std::cout <<
"genome index " << index <<
" corresponds to chromosome " << chr <<
" position " << chrIndex << std::endl;
1416 const char *chromosomeName =
"5";
1422 std::cout <<
"Chromosome '" << chromosomeName <<
"' position " << chrIndex <<
" corresponds to genome index position " << index << std::endl;
1427 void testGenomeSequence(
void)
1432 std::string referenceName =
"someotherreference";
1433 if (reference.setFastaName(referenceName))
1435 std::cerr <<
"failed to open reference file "
1442 std::cerr <<
"open and prefetch the reference genome: ";
1445 if (reference.
open())
1449 std::cerr <<
"done!" << std::endl;
1454 genomeIndex_t genomeIndex;
1455 unsigned int chromosomeIndex;
1456 unsigned int chromosome;
1457 std::string chromosomeName;
1463 chromosomeName =
"2";
1464 chromosomeIndex = 1234567;
1466 genomeIndex = reference.
getGenomePosition(chromosomeName.c_str(), chromosomeIndex);
1467 assert(genomeIndex!=INVALID_GENOME_INDEX);
1468 std::cout <<
"Chromosome " << chromosomeName <<
", index ";
1469 std::cout << chromosomeIndex <<
" contains base " << reference[genomeIndex];
1470 std::cout <<
" at genome index position " << genomeIndex << std::endl;
1479 unsigned int newChromosomeIndex;
1481 newChromosomeIndex = genomeIndex - reference.getChromosomeStart(chromosome) + 1;
1483 assert(chromosomeIndex == newChromosomeIndex);
1489 pr.set(
"ATCGATCG", 0);
1490 assert(pr.size()==8);
1495 pr.set(
"ATCGATCG", 1);
1496 assert(pr.size()==9);
1498 assert(pr.size()==0);
1500 assert(pr.size()==1);
1502 assert(pr.size()==2);
1504 assert(pr.size()==3);
1509 assert(pr.size()==2);
1522 int main(
int argc,
const char **argv)
1527 testGenomeSequence();