libStatGen Software  1
IndexBase.cpp
1 /*
2  * Copyright (C) 2010-2012 Regents of the University of Michigan
3  *
4  * This program is free software: you can redistribute it and/or modify
5  * it under the terms of the GNU General Public License as published by
6  * the Free Software Foundation, either version 3 of the License, or
7  * (at your option) any later version.
8  *
9  * This program is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program. If not, see <http://www.gnu.org/licenses/>.
16  */
17 
18 #include "IndexBase.h"
19 #include <iomanip>
20 
21 Chunk SortedChunkList::pop()
22 {
23  Chunk newChunk = chunkList.begin()->second;
24  chunkList.erase(chunkList.begin());
25  return(newChunk);
26 }
27 
28 
29 bool SortedChunkList::insert(const Chunk& chunkToInsert)
30 {
31  std::pair<std::map<uint64_t, Chunk>::iterator, bool> insertRes;
32  // Insert the passed in chunk.
33  insertRes =
34  chunkList.insert(std::pair<uint64_t, Chunk>(chunkToInsert.chunk_beg,
35  chunkToInsert));
36 
37  if(!insertRes.second)
38  {
39  // Failed to insert the chunk.
40  std::cerr << "Failed to insert into the SortedChunkList.\n";
41  std::cerr << "\tpreviously found chunk:\tbeg = " << std::hex
42  << insertRes.first->second.chunk_beg
43  << "\tend = "
44  << insertRes.first->second.chunk_end
45  << "\nnew chunk:\tbeg = " << std::hex
46  << chunkToInsert.chunk_beg
47  << "\tend = "
48  << chunkToInsert.chunk_end
49  << std::endl;
50  }
51  // return the result that comes from insertRes.
52  return(insertRes.second);
53 }
54 
55 void SortedChunkList::clear()
56 {
57  chunkList.clear();
58 }
59 
60 bool SortedChunkList::empty()
61 {
62  return(chunkList.empty());
63 }
64 
65 
66 // Merge overlapping chunks found in this list.
67 bool SortedChunkList::mergeOverlapping()
68 {
69  // Start at the beginning of the list and iterate through.
70  std::map<uint64_t, Chunk>::iterator currentPos = chunkList.begin();
71  std::map<uint64_t, Chunk>::iterator nextPos = chunkList.begin();
72  if(nextPos != chunkList.end())
73  {
74  ++nextPos;
75  }
76 
77  // Loop until the end is reached.
78  while(nextPos != chunkList.end())
79  {
80  // If the next chunk is completely contained within the current
81  // chunk (its end is less than the current chunk's end),
82  // delete it since its position is already covered.
83  if(nextPos->second.chunk_end < currentPos->second.chunk_end)
84  {
85  chunkList.erase(nextPos);
86  nextPos = currentPos;
87  ++nextPos;
88  continue;
89  }
90 
91  // If the next chunk's start position's BGZF block is less than or
92  // equal to the BGZF block of the current chunk's end position,
93  // combine the two chunks into the current chunk.
94  if((nextPos->second.chunk_beg >> 16) <=
95  (currentPos->second.chunk_end >> 16))
96  {
97  currentPos->second.chunk_end = nextPos->second.chunk_end;
98  // nextPos has now been included in the current pos, so
99  // remove it.
100  chunkList.erase(nextPos);
101  nextPos = currentPos;
102  ++nextPos;
103  continue;
104  }
105  else
106  {
107  // Nothing to combine. So try combining at the next
108  currentPos = nextPos;
109  ++nextPos;
110  }
111  }
112  return(true);
113 }
114 
115 
116 IndexBase::IndexBase()
117  : n_ref(0)
118 {
119  myRefs.clear();
120 }
121 
122 
123 
124 IndexBase::~IndexBase()
125 {
126 }
127 
128 
129 // Reset the member data for a new index file.
131 {
132  n_ref = 0;
133  // Clear the references.
134  myRefs.clear();
135 }
136 
137 
138 // Get the number of references in this index.
139 int32_t IndexBase::getNumRefs() const
140 {
141  // Return the number of references.
142  return(myRefs.size());
143 }
144 
145 
146 // The basic logic is from samtools reg2bins and the samtools format specification pdf.
147 // Set bins in the region to 1 and all other bins to 0.
148 void IndexBase::getBinsForRegion(uint32_t start, uint32_t end, bool binMap[MAX_NUM_BINS+1])
149 {
150  for(uint32_t index = 0; index < MAX_NUM_BINS+1; index++)
151  {
152  binMap[index] = false;
153  }
154 
155  uint32_t binNum = 0;
156  --end;
157 
158  // Check if beg/end go too high, set to max position.
159  if(start > MAX_POSITION)
160  {
161  start = MAX_POSITION;
162  }
163  if(end > MAX_POSITION)
164  {
165  end = MAX_POSITION;
166  }
167 
168  // Turn on bins.
169  binMap[binNum] = true;
170  for (binNum = 1 + (start>>26); binNum <= 1 + (end>>26); ++binNum)
171  binMap[binNum] = true;
172  for (binNum = 9 + (start>>23); binNum <= 9 + (end>>23); ++binNum)
173  binMap[binNum] = true;
174  for (binNum = 73 + (start>>20); binNum <= 73 + (end>>20); ++binNum)
175  binMap[binNum] = true;
176  for (binNum = 585 + (start>>17); binNum <= 585 + (end>>17); ++binNum)
177  binMap[binNum] = true;
178  for (binNum = 4681 + (start>>14); binNum <= 4681 + (end>>14); ++binNum)
179  binMap[binNum] = true;
180 }
181 
182 
183 // Returns the minimum offset of records that cross the 16K block that
184 // contains the specified position for the given reference id.
185 bool IndexBase::getMinOffsetFromLinearIndex(int32_t refID, uint32_t position,
186  uint64_t& minOffset) const
187 {
188  int32_t linearIndex = position >> LINEAR_INDEX_SHIFT;
189 
190  minOffset = 0;
191 
192  if(refID > n_ref)
193  {
194  // out of range of the references, return false.
195  return(false);
196  }
197  // Check to see if the position is out of range of the linear index.
198  int32_t linearOffsetSize = myRefs[refID].n_intv;
199 
200  // If there are no entries in the linear index, return false.
201  // Or if the linear index is not large enough to include
202  // the start block, then there can be no records that cross
203  // our region, so return false.
204  if((linearOffsetSize == 0) || (linearIndex >= linearOffsetSize))
205 
206  {
207  return(false);
208  }
209 
210  // The linear index is specified for this block, so return that value.
211  minOffset = myRefs[refID].ioffsets[linearIndex];
212 
213  // If the offset is 0, go to the previous block that has an offset.
214  // This is due to a couple of bugs in older sam tools indexes.
215  // 1) they add one to the index location (so when reading those, you
216  // may be starting earlier than necessary)
217  // 2) (the bigger issue) They did not include bins 4681-37449 in
218  // the linear index.
219  while((minOffset == 0) && (--linearIndex >= 0))
220  {
221  minOffset = myRefs[refID].ioffsets[linearIndex];
222  }
223 
224 
225  // If the minOffset is still 0 when moving forward,
226  // check later indices to find a non-zero since we don't want to return
227  // an offset of 0 since the record can't start at 0 we want to at least
228  // return the first record position for this reference.
229  linearIndex = 0;
230  while((minOffset == 0) && (linearIndex < linearOffsetSize))
231  {
232  minOffset = myRefs[refID].ioffsets[linearIndex];
233  linearIndex++;
234  }
235  if(minOffset == 0)
236  {
237  // Could not find a valid start position for this reference.
238  return(false);
239  }
240  return(true);
241 }
int32_t getNumRefs() const
Get the number of references in this index.
Definition: IndexBase.cpp:139
virtual void resetIndex()
Reset the member data for a new index file.
Definition: IndexBase.cpp:130