libStatGen Software  1
CigarHelper.cpp
1 /*
2  * Copyright (C) 2011 Regents of the University of Michigan
3  *
4  * This program is free software: you can redistribute it and/or modify
5  * it under the terms of the GNU General Public License as published by
6  * the Free Software Foundation, either version 3 of the License, or
7  * (at your option) any later version.
8  *
9  * This program is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program. If not, see <http://www.gnu.org/licenses/>.
16  */
17 
18 //////////////////////////////////////////////////////////////////////////
19 
20 #include "CigarHelper.h"
21 
22 // Soft Clip from the beginning of the read to the specified reference position.
24  int32_t refPosition0Based,
25  CigarRoller& newCigar,
26  int32_t &new0BasedPosition)
27 {
28  newCigar.clear();
29  Cigar* cigar = record.getCigarInfo();
30  if(cigar == NULL)
31  {
32  // Failed to get the cigar.
33  ErrorHandler::handleError("Soft clipping, but failed to read the cigar");
34  return(NO_CLIP);
35  }
36 
37  // No cigar or position in the record, so return no clip.
38  if((cigar->size() == 0) || (record.get0BasedPosition() == -1))
39  {
40  return(NO_CLIP);
41  }
42 
43  // Check to see if the reference position occurs before the record starts,
44  // if it does, do no clipping.
45  if(refPosition0Based < record.get0BasedPosition())
46  {
47  // Not within this read, so nothing to clip.
48  newCigar.Set(record.getCigar());
49  return(NO_CLIP);
50  }
51 
52  // The position falls after the read starts, so loop through until the
53  // position or the end of the read is found.
54  int32_t readClipPosition = 0;
55  bool clipWritten = false;
56  new0BasedPosition = record.get0BasedPosition();
57  for(int i = 0; i < cigar->size(); i++)
58  {
59  const Cigar::CigarOperator* op = &(cigar->getOperator(i));
60 
61  if(clipWritten)
62  {
63  // Clip point has been found, so just add everything.
64  newCigar += *op;
65  // Go to the next operation.
66  continue;
67  }
68 
69  // The clip point has not yet been found, so check to see if we found
70  // it now.
71 
72  // Not a clip, check to see if the operation is found in the
73  // reference.
75  {
76  // match, mismatch, deletion, skip
77 
78  // increment the current reference position to just past this
79  // operation.
80  new0BasedPosition += op->count;
81 
82  // Check to see if this is also in the query, because otherwise
83  // the operation is still being consumed.
84  if(Cigar::foundInQuery(*op))
85  {
86  // Also in the query, determine if the entire thing should
87  // be clipped or just part of it.
88 
89  uint32_t numKeep = 0;
90  // Check to see if we have hit our clip position.
91  if(refPosition0Based < new0BasedPosition)
92  {
93  // The specified clip position is in this cigar operation.
94  numKeep = new0BasedPosition - refPosition0Based - 1;
95 
96  if(numKeep > op->count)
97  {
98  // Keep the entire read. This happens because
99  // we keep reading until the first match/mismatch
100  // after the clip.
101  numKeep = op->count;
102  }
103  }
104 
105  // Add the part of this operation that is being clipped
106  // to the clip count.
107  readClipPosition += (op->count - numKeep);
108 
109  // Only write the clip if we found a match/mismatch
110  // to write. Otherwise we will keep accumulating clips
111  // for the case of insertions.
112  if(numKeep > 0)
113  {
114  new0BasedPosition -= numKeep;
115 
116  newCigar.Add(Cigar::softClip, readClipPosition);
117 
118  // Add the clipped part of this cigar to the clip
119  // position.
120  newCigar.Add(op->operation, numKeep);
121 
122  // Found a match after the clip point, so stop
123  // consuming cigar operations.
124  clipWritten = true;
125  continue;
126  }
127  }
128  }
129  else
130  {
131  // Only add hard clips. The softclips will be added in
132  // when the total number is found.
133  if(op->operation == Cigar::hardClip)
134  {
135  // Check if this is the first operation, if so, just write it.
136  if(i == 0)
137  {
138  newCigar += *op;
139  }
140  // Check if it is the last operation (otherwise skip it).
141  else if(i == (cigar->size() - 1))
142  {
143  // Check whether or not the clip was ever written, and if
144  // not, write it.
145  if(clipWritten == false)
146  {
147  newCigar.Add(Cigar::softClip, readClipPosition);
148  // Since no match/mismatch was ever found, set
149  // the new ref position to the original one.
150  new0BasedPosition = record.get0BasedPosition();
151  clipWritten = true;
152  }
153  // Add the hard clip.
154  newCigar += *op;
155  }
156  }
157  // Not yet to the clip position, so do not add this operation.
158  if(Cigar::foundInQuery(*op))
159  {
160  // Found in the query, so update the read clip position.
161  readClipPosition += op->count;
162  }
163  }
164  } // End loop through cigar.
165 
166 
167  // Check whether or not the clip was ever written, and if
168  // not, write it.
169  if(clipWritten == false)
170  {
171  newCigar.Add(Cigar::softClip, readClipPosition);
172  // Since no match/mismatch was ever found, set
173  // the new ref position to the original one.
174  new0BasedPosition = record.get0BasedPosition();
175  }
176 
177  // Subtract 1 since readClipPosition atually contains the first 0based
178  // position that is not clipped.
179  return(readClipPosition - 1);
180 }
181 
182 
183 // Soft Clip from the end of the read at the specified reference position.
185  int32_t refPosition0Based,
186  CigarRoller& newCigar)
187 {
188  newCigar.clear();
189  Cigar* cigar = record.getCigarInfo();
190  if(cigar == NULL)
191  {
192  // Failed to get the cigar.
193  ErrorHandler::handleError("Soft clipping, but failed to read the cigar");
194  return(NO_CLIP);
195  }
196 
197  // No cigar or position in the record, so return no clip.
198  if((cigar->size() == 0) || (record.get0BasedPosition() == -1))
199  {
200  return(NO_CLIP);
201  }
202 
203  // Check to see if the reference position occurs after the record ends,
204  // if so, do no clipping.
205  if(refPosition0Based > record.get0BasedAlignmentEnd())
206  {
207  // Not within this read, so nothing to clip.
208  newCigar.Set(record.getCigar());
209  return(NO_CLIP);
210  }
211 
212  // The position falls before the read ends, so loop through until the
213  // position is found.
214  int32_t currentRefPosition = record.get0BasedPosition();
215  int32_t readClipPosition = 0;
216  for(int i = 0; i < cigar->size(); i++)
217  {
218  const Cigar::CigarOperator* op = &(cigar->getOperator(i));
219 
220  // If the operation is found in the reference, increase the
221  // reference position.
222  if(Cigar::foundInReference(*op))
223  {
224  // match, mismatch, deletion, skip
225  // increment the current reference position to just past
226  // this operation.
227  currentRefPosition += op->count;
228  }
229 
230  // Check to see if we have hit our clip position.
231  if(refPosition0Based < currentRefPosition)
232  {
233  // If this read is also in the query (match/mismatch),
234  // write the partial op to the new cigar.
235  int32_t numKeep = 0;
236  if(Cigar::foundInQuery(*op))
237  {
238  numKeep = op->count - (currentRefPosition - refPosition0Based);
239  if(numKeep > 0)
240  {
241  newCigar.Add(op->operation, numKeep);
242  readClipPosition += numKeep;
243  }
244  }
245  else if(Cigar::isClip(*op))
246  {
247  // This is a hard clip, so write it.
248  newCigar.Add(op->operation, op->count);
249  }
250  else
251  {
252 
253  // Not found in the query (skip/deletion),
254  // so don't write any of the operation.
255  }
256  // Found the clip point, so break.
257  break;
258  }
259  else if(refPosition0Based == currentRefPosition)
260  {
261  newCigar += *op;
262  if(Cigar::foundInQuery(*op))
263  {
264  readClipPosition += op->count;
265  }
266  }
267  else
268  {
269  // Not yet to the clip position, so add this operation/size to
270  // the new cigar.
271  newCigar += *op;
272  if(Cigar::foundInQuery(*op))
273  {
274  // Found in the query, so update the read clip position.
275  readClipPosition += op->count;
276  }
277  }
278  } // End loop through cigar.
279 
280  // Before adding the softclip, read from the end of the cigar checking to
281  // see if the operations are in the query, removing operations that are
282  // not (pad/delete/skip) until a hardclip or an operation in the query is
283  // found. We do not want a pad/delete/skip right before a softclip.
284  for(int j = newCigar.size() - 1; j >= 0; j--)
285  {
286  const Cigar::CigarOperator* op = &(newCigar.getOperator(j));
287  if(!Cigar::foundInQuery(*op) && !Cigar::isClip(*op))
288  {
289  // pad/delete/skip
290  newCigar.Remove(j);
291  }
292  else if(Cigar::foundInQuery(*op) & Cigar::isClip(*op))
293  {
294  // Soft clip, so increment the clip position for the return value.
295  // Remove the softclip since the readClipPosition is used to
296  // calculate teh size of the soft clip added.
297  readClipPosition -= op->count;
298  newCigar.Remove(j);
299  }
300  else
301  {
302  // Found a cigar operation that should not be deleted, so stop deleting.
303  break;
304  }
305  }
306 
307  // Determine the number of soft clips.
308  int32_t numSoftClips = record.getReadLength() - readClipPosition;
309 
310  if(numSoftClips < 0)
311  {
312  // Error, read len is less than the clip position
313  ErrorHandler::handleError("Soft clipping error: readLength is less than the clip position");
314  return(NO_CLIP);
315  }
316 
317  // NOTE that if the previous operation is a softclip, the CigarRoller logic
318  // will merge this with that one.
319  newCigar.Add(Cigar::softClip, numSoftClips);
320 
321  // Check if an ending hard clip needs to be added.
322  if(cigar->size() != 0)
323  {
324  const Cigar::CigarOperator* lastOp =
325  &(cigar->getOperator(cigar->size() - 1));
326  if(lastOp->operation == Cigar::hardClip)
327  {
328  newCigar += *lastOp;
329  }
330  }
331 
332  return(readClipPosition);
333 }
static int32_t softClipBeginByRefPos(SamRecord &record, int32_t refPosition0Based, CigarRoller &newCigar, int32_t &new0BasedPosition)
Soft clip the cigar from the beginning of the read at the specified reference position.
Definition: CigarHelper.cpp:23
static int32_t softClipEndByRefPos(SamRecord &record, int32_t refPosition0Based, CigarRoller &newCigar)
Soft clip the cigar from the back of the read at the specified reference position.
The purpose of this class is to provide accessors for setting, updating, modifying the CIGAR object....
Definition: CigarRoller.h:67
bool Remove(int index)
Remove the operation at the specified index.
void Add(Operation operation, int count)
Append the specified operation with the specified count to this object.
Definition: CigarRoller.cpp:77
void clear()
Clear this object so that it has no Cigar Operations.
void Set(const char *cigarString)
Sets this object to the specified cigarString.
This class represents the CIGAR without any methods to set the cigar (see CigarRoller for that).
Definition: Cigar.h:84
int size() const
Return the number of cigar operations.
Definition: Cigar.h:364
static bool foundInReference(Operation op)
Return true if the specified operation is found in the reference sequence, false if not.
Definition: Cigar.h:177
static bool foundInQuery(Operation op)
Return true if the specified operation is found in the query sequence, false if not.
Definition: Cigar.h:219
@ hardClip
Hard clip on the read (clipped sequence not present in the query sequence or reference)....
Definition: Cigar.h:95
@ softClip
Soft clip on the read (clipped sequence present in the query sequence, but not in reference)....
Definition: Cigar.h:94
const CigarOperator & getOperator(int i) const
Return the Cigar Operation at the specified index (starting at 0).
Definition: Cigar.h:354
static bool isClip(Operation op)
Return true if the specified operation is a clipping operation, false if not.
Definition: Cigar.h:261
static void handleError(const char *message, HandlingType handlingType=EXCEPTION)
Handle an error based on the error handling type.
Class providing an easy to use interface to get/set/operate on the fields in a SAM/BAM record.
Definition: SamRecord.h:52
Cigar * getCigarInfo()
Returns a pointer to the Cigar object associated with this record.
Definition: SamRecord.cpp:1836
int32_t getReadLength()
Get the length of the read.
Definition: SamRecord.cpp:1391
int32_t get0BasedAlignmentEnd()
Returns the 0-based inclusive rightmost position of the clipped sequence.
Definition: SamRecord.cpp:1467
int32_t get0BasedPosition()
Get the 0-based(BAM) leftmost position of the record.
Definition: SamRecord.cpp:1319
const char * getCigar()
Returns the SAM formatted CIGAR string.
Definition: SamRecord.cpp:1555