18 #ifndef _GENOME_SEQUENCE_H
19 #define _GENOME_SEQUENCE_H
21 #include <sys/types.h>
23 #if !defined(MD5_DIGEST_LENGTH)
24 #define MD5_DIGEST_LENGTH 16
27 #include "MemoryMapArray.h"
28 #include "BaseAsciiMap.h"
31 #include "StringArray.h"
38 #define UINT32_MAX 0xFFFFFFFF
41 typedef uint32_t genomeIndex_t;
42 #define INVALID_GENOME_INDEX UINT32_MAX
45 #define INVALID_CHROMOSOME_INDEX -1
47 #include "GenomeSequenceHelpers.h"
49 #define UMFA_COOKIE 0x1b7933a1 // unique cookie id
50 #define UMFA_VERSION 20100401U // YYYYMMDD of last change to file layout
59 Packed4BitElementCount2Bytes,
104 std::ostream *_progressStream;
108 bool _createOverwrite;
110 std::string _baseFilename;
111 std::string _referenceFilename;
112 std::string _fastaFilename;
113 std::string _umfaFilename;
114 std::string _application;
118 void setup(
const char *referenceFilename);
123 void constructorClear();
131 setup(referenceFilename.c_str());
141 setup(referenceFilename);
159 bool open(
const char *filename,
int flags = O_RDONLY)
161 _umfaFilename = filename;
167 bool _searchCommonFileSuffix;
169 bool create(
bool isColor =
false);
176 void setColorSpace(
bool colorSpace) {_colorSpace = colorSpace; }
177 void setSearchCommonFileSuffix(
bool searchCommonFileSuffix) {_searchCommonFileSuffix = searchCommonFileSuffix;}
180 void setCreateOverwrite(
bool createOverwrite) {_createOverwrite = createOverwrite;}
182 bool loadFastaData(
const char *filename);
196 _application = application;
198 const std::string &getFastaName()
const
200 return _fastaFilename;
202 const std::string &getReferenceName()
const
204 return _referenceFilename;
218 return getElementCount();
248 if (chromosomeIndex==INVALID_CHROMOSOME_INDEX)
return INVALID_GENOME_INDEX;
249 return header->_chromosomes[chromosomeIndex].start;
258 if (chromosomeIndex==INVALID_CHROMOSOME_INDEX)
return 0;
259 return header->_chromosomes[chromosomeIndex].size;
268 const char *chromosomeName,
269 unsigned int chromosomeIndex)
const;
278 unsigned int chromosomeIndex)
const;
285 const std::string &getBaseFilename()
const
287 return _baseFilename;
290 const char *getChromosomeName(
int chromosomeIndex)
const
292 return header->_chromosomes[chromosomeIndex].name;
295 void setDebugFlag(
bool d)
300 genomeIndex_t sequenceLength()
const
302 return (genomeIndex_t) header->elementCount;
305 const char *chromosomeName(
int chr)
const
307 return header->_chromosomes[chr].name;
310 void sanityCheck(
MemoryMap &fasta)
const;
313 std::string IntegerToSeq(
unsigned int n,
unsigned int wordsize)
const;
315 bool wordMatch(
unsigned int index, std::string &word)
const;
316 bool printNearbyWords(
unsigned int index,
unsigned int variance, std::string &word)
const;
319 char BasePair(
char c)
const
324 void dumpSequenceSAMDictionary(std::ostream&)
const;
325 void dumpHeaderTSV(std::ostream&)
const;
355 inline char operator[](genomeIndex_t index)
const
368 val = ((uint8_t *) data)[index>>1] & 0xf;
372 val = (((uint8_t *) data)[index>>1] & 0xf0) >> 4;
388 inline char getBase(
const char *chromosomeName,
389 unsigned int chromosomeIndex)
const
391 genomeIndex_t index =
393 if(index == INVALID_GENOME_INDEX)
398 return((*
this)[index]);
402 inline uint8_t getInteger(genomeIndex_t index)
const
407 inline void set(genomeIndex_t index,
char value)
409 genomeSequenceArray::set(index,
424 return ((uint8_t *) data + index/2);
432 bool setChromosomeMD5andLength(uint32_t whichChromosome);
438 void getReverseRead(std::string &read);
441 void getReverseRead(
String& read);
444 int debugPrintReadValidation(
446 std::string &quality,
448 genomeIndex_t readLocation,
458 void getString(std::string &str,
int chromosome, uint32_t index,
int baseCount)
const;
459 void getString(
String &str,
int chromosome, uint32_t index,
int baseCount)
const;
465 void getString(std::string &str, genomeIndex_t index,
int baseCount)
const;
466 void getString(
String &str, genomeIndex_t index,
int baseCount)
const;
468 void getHighLightedString(std::string &str, genomeIndex_t index,
int baseCount, genomeIndex_t highLightStart, genomeIndex_t highLightEnd)
const;
470 void print30(genomeIndex_t)
const;
473 genomeIndex_t simpleLocalAligner(std::string &read, std::string &quality, genomeIndex_t index,
int windowSize)
const;
490 int mismatchCount = 0;
491 for (uint32_t i=0; i<read.size(); i++)
492 if (read[i]!=exclude) mismatchCount += read[i]!=(*this)[location + i];
493 return mismatchCount;
501 int getSumQ(std::string &read, std::string &qualities, genomeIndex_t location)
const
504 for (uint32_t i=0; i<read.size(); i++)
505 sumQ += (read[i]!=(*
this)[location + i] ? (qualities[i]-33) : 0);
509 void getMismatchHatString(std::string &result,
const std::string &read, genomeIndex_t location)
const;
510 void getMismatchString(std::string &result,
const std::string &read, genomeIndex_t location)
const;
514 void getChromosomeAndIndex(std::string &, genomeIndex_t)
const;
515 void getChromosomeAndIndex(
String &, genomeIndex_t)
const;
530 std::string &qualities,