libStatGen Software  1
BgzfFileTypeRecovery.cpp
1 /*
2  * Copyright (C) 2010 Regents of the University of Michigan
3  *
4  * This program is free software: you can redistribute it and/or modify
5  * it under the terms of the GNU General Public License as published by
6  * the Free Software Foundation, either version 3 of the License, or
7  * (at your option) any later version.
8  *
9  * This program is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program. If not, see <http://www.gnu.org/licenses/>.
16  */
17 
18 #ifdef __ZLIB_AVAILABLE__
19 
20 #include "BgzfFileTypeRecovery.h"
21 
22 #include <stdio.h>
23 #include <stdint.h>
24 
25 #include <sys/types.h>
26 #include <sys/stat.h>
27 #include <fcntl.h>
28 #include <stdlib.h>
29 #include <string.h>
30 #include <zlib.h>
31 
32 #include <fstream>
33 #include <iostream>
34 #include <stdexcept>
35 #include <vector>
36 
37 #pragma pack(push,1)
38 
39 #define debug false
40 
//
// Fixed-size leading part of a gzip member header, declared field by
// field so an instance can be filled by copying raw bytes read from a file.
//
// The class is declared inside a #pragma pack(push,1) region, so the
// members are laid out without padding.
//
// NOTE(review): multi-byte members are compared in host byte order; the
// file comments describe the on-disk fields as little endian, so this
// presumably assumes a little-endian host - confirm before porting.
//
class RecoveryGzipHeader {
private:
    uint8_t  m_ID1;
    uint8_t  m_ID2;
    uint8_t  m_CM;
    uint8_t  m_FLG;
    uint32_t m_MTIME;
    uint8_t  m_XFL;
    uint8_t  m_OS;
    uint16_t m_XLEN;
public:
    // A freshly constructed header is all zeroes, and therefore not sane().
    RecoveryGzipHeader() :
        m_ID1(0),
        m_ID2(0),
        m_CM(0),
        m_FLG(0),
        m_MTIME(0),
        m_XFL(0),
        m_OS(0),
        m_XLEN(0)
    {}

    // Load the exact values that sane() accepts.
    void defaults() {
        m_ID1 = 31;
        m_ID2 = 139;
        m_CM = 8;
        m_FLG = 4;
        m_MTIME = 0;
        m_XFL = 0;
        m_OS = 255;
        m_XLEN = 6;
    }

    // Read-only accessors; const so they can be used through const
    // references and pointers.
    uint8_t  ID1()   const {return m_ID1;}
    uint8_t  ID2()   const {return m_ID2;}
    uint8_t  CM()    const {return m_CM;}
    uint8_t  FLG()   const {return m_FLG;}
    uint32_t MTIME() const {return m_MTIME;}
    uint8_t  XFL()   const {return m_XFL;}
    uint8_t  OS()    const {return m_OS;}
    uint16_t XLEN()  const {return m_XLEN;}

    // True only when every field holds the value written by defaults().
    bool sane() const {
        return m_ID1 == 31 && m_ID2 == 139 && m_CM == 8 && m_FLG == 4 &&
               m_MTIME == 0 && m_XFL == 0 && m_OS == 255 && m_XLEN == 6;
    }
};

//
// Gzip header followed by the BGZF extra subfield (SI1, SI2, SLEN, BSIZE).
//
// The constructor only sets the subfield members; the inherited gzip
// fields start out zeroed until defaults() is called or raw bytes are
// copied over the object.
//
class BGZFHeader : public RecoveryGzipHeader {
private:
    uint8_t  m_SI1;
    uint8_t  m_SI2;
    uint16_t m_SLEN;     // little endian
    uint16_t m_BSIZE;    // little endian
public:
    // Parameter names deliberately differ from the member names so the
    // initializer list cannot be misread.
    BGZFHeader(
        uint8_t si1 = 'B',
        uint8_t si2 = 'C',
        uint16_t slen = 2,
        uint16_t bsize = 0
    ) : m_SI1(si1), m_SI2(si2), m_SLEN(slen), m_BSIZE(bsize) {}

    uint8_t  SI1()   const {return m_SI1;}
    uint8_t  SI2()   const {return m_SI2;}
    uint16_t SLEN()  const {return m_SLEN;}
    uint16_t BSIZE() const {return m_BSIZE;}

    // The gzip part must be sane, the subfield must be tagged 'B','C'
    // with length 2, and BSIZE must exceed the size of this header.
    bool sane() const {
        return RecoveryGzipHeader::sane() &&
               m_SI1 == 'B' && m_SI2 == 'C' && m_SLEN == 2 &&
               m_BSIZE > sizeof(BGZFHeader);
    }
};
108 
109 #pragma pack(pop)
110 
111 //
112 // PeekaheadBuffer allows non-destructive peekahead and resyncing
113 // after read errors when the underlying stream has signatures in the
114 // data that allow it.
115 //
116 // In addition, it has a peek() capability to allow
117 // the caller to look ahead in the stream to see
118 // a certain number of bytes before actually consuming them.
119 //
120 // The intent is that this class behave as something of a poor
121 // man's FIFO - with the cost of buffer movement when data is removed.
122 //
123 // This is far from ideal, but we basically are moving data around
124 // when allowing arbitrary peekahead regardless.
125 //
// The basis for the design is the fact that most read calls to
// various streams at best allow a single character to be peeked
// at, and secondly, do not allow for recovery after an underlying
// framing error occurs.
//
// That is, getchar()/putchar()/ungetc() support a single byte
// peek. This may be fine for simple parsing applications, but here
// we need to look at blocks up to 64K or more in size to search
// for signatures while re-synchronizing on the underlying stream.
//
135 //
class PeekaheadBuffer : public std::vector<uint8_t> {

protected:
    // Index of the first unconsumed byte in the vector.  Everything
    // before it has already been handed out by read().
    ssize_t m_startPosition;

public:
    enum ReturnCode {
        endOfFile = -1,
        reSync = 0,
        ok = 1
    };

    ssize_t startPosition() {return m_startPosition;}

private:
    //
    // When the unconsumed data has shrunk to less than 1/8 of the
    // vector, drop the consumed prefix so the vector does not grow
    // without bound.
    //
    // Used by read(), which consumes data from the buffer.
    //
    void shiftData() {
        if(dataRemaining() < (ssize_t) (std::vector<uint8_t>::size() / 8) ) {
            erase(begin(), begin() + m_startPosition);
            m_startPosition = 0;
        }
    }

    // Discards buffered data after a read error; derived classes may
    // override to also re-synchronize their underlying stream.
    virtual ReturnCode sync();

public:
    PeekaheadBuffer();
    virtual ~PeekaheadBuffer();

    // Number of buffered bytes that have not been consumed yet.
    ssize_t dataRemaining();

    //
    // NB: the inherited size() is the size of the whole vector,
    // including already consumed bytes - use dataRemaining() instead.
    //

    //
    // Stream-specific: make sure at least count unconsumed bytes are
    // buffered.  If that much (or more) is already present this is a
    // NOP.  Returns ok, reSync or endOfFile.
    //
    virtual ReturnCode readahead(ssize_t count) = 0;

    //
    // Generic: copy count bytes into buffer and consume them, calling
    // readahead() to populate the data first.
    //
    // Returns ok when count bytes were copied, reSync when readahead()
    // reported a recoverable error, and endOfFile for any other
    // readahead() failure.  Nothing is copied or consumed unless ok
    // is returned.
    //
    ReturnCode read(uint8_t *buffer, ssize_t count) {
        // Nothing to copy: return before touching begin(), which must
        // not be dereferenced while the vector is empty.
        if(count <= 0) {
            return ok;
        }

        const ReturnCode rc = readahead(count);

        if(rc == reSync) {
            // CRC error or similar - the caller decides whether to sync forwards
            return reSync;
        }
        if(rc != ok) {
            // failed to get the needed data - premature EOF
            return endOfFile;
        }

        memcpy(buffer, &(*begin()) + m_startPosition, count);

        m_startPosition += count;     // consume data

        // recover space if wasting too much:
        shiftData();

        return ok;
    }

};

PeekaheadBuffer::PeekaheadBuffer() : m_startPosition(0)
{
}

PeekaheadBuffer::~PeekaheadBuffer()
{
}

//
// Throw away everything that is buffered.  The read position has to be
// reset together with the vector, otherwise dataRemaining() would go
// negative and the first m_startPosition bytes loaded afterwards would
// be skipped.
//
PeekaheadBuffer::ReturnCode PeekaheadBuffer::sync() {
    clear();
    m_startPosition = 0;
    return ok;
}

ssize_t PeekaheadBuffer::dataRemaining()
{
    return (ssize_t) std::vector<uint8_t>::size() - m_startPosition;
}
233 
234 
235 // peekahead buffered file reader class
236 class FileReader : public PeekaheadBuffer {
237  FILE *m_stream;
238 public:
239  FileReader();
240  ~FileReader();
241  FileReader(FILE *stream);
242  PeekaheadBuffer::ReturnCode readahead(ssize_t count);
243  FILE *stream() {return m_stream;}
244  bool eof() {return m_stream ? feof(m_stream) : false;}
245 };
246 
247 FileReader::FileReader()
248 {
249  m_stream = NULL;
250 }
251 
252 FileReader::FileReader(FILE *stream) : m_stream(stream)
253 {
254 }
255 
256 FileReader::~FileReader()
257 {
258  fclose(m_stream);
259  m_stream = NULL;
260 }
261 
262 //
263 // fill buffer until we have count bytes of valid
264 // data.
265 //
266 // need to detect error and eof and return appropriate values.
267 //
268 PeekaheadBuffer::ReturnCode FileReader::readahead(ssize_t count)
269 {
270  uint8_t buffer[4096];
271  while(dataRemaining() < count) {
272  int bytesRead = fread(buffer, 1, sizeof(buffer), m_stream);
273  if(bytesRead==0) {
274  if(ferror(m_stream)) {
275  return reSync;
276  }
277  // ain't getting no more data...
278  return endOfFile;
279  }
280 #if 0
281  fprintf(stderr, "\n\n");
282  int possible = -1;
283  for(int i=0;i<bytesRead;i+=16) {
284  fprintf(stderr,"%08x: ", i);
285  for(int j=0;j<16;j++) {
286  if(buffer[i+j]==31 && buffer[i+j+1]==139) {
287  possible = i+j;
288  }
289  fprintf(stderr,"%02x ", buffer[i+j]);
290  }
291  fprintf(stderr, "\n");
292  }
293  if(possible>0) {
294  fprintf(stderr,"possible signature at %08x\n", possible);
295  }
296 #endif
297  insert(end(), &buffer[0], &buffer[0] + bytesRead);
298  }
299  return ok;
300 }
301 
302 class BGZFReader : public PeekaheadBuffer {
303  FileReader m_fileReader;
304 
305 public:
306 
307  BGZFReader(FILE *stream) : m_fileReader(stream) {;}
308 
309  PeekaheadBuffer::ReturnCode readahead(ssize_t count);
310 
311  //
312  // This will be reading data, and needs to return EOF, etc
313  //
314  ReturnCode sync() {
315  // my internal data is now bad, so we'll scan ahead seeing
316  // if we can find a good header
317  clear();
318  PeekaheadBuffer::ReturnCode rc;
319  while((rc = m_fileReader.readahead(sizeof(BGZFHeader)))==ok ) {
320  BGZFHeader *header;
321  if(rc==endOfFile) return rc;
322  // a rc==reSync is ok provided readahead still ensures that header is present
323  void *src = &(*(m_fileReader.begin())) + m_fileReader.startPosition();
324  header = (BGZFHeader *) src;
325  if(header->sane()) {
326  if(debug) std::cerr << "BGZFReader::sync returning reSync\n";
327  return reSync; // tell caller they need to sync up
328  }
329  // consume a byte, then see if we're at a valid block header
330  uint8_t throwAwayBuffer;
331  rc = m_fileReader.read(&throwAwayBuffer, 1);
332  }
333  return rc;
334  }
335  FILE *stream() {return m_fileReader.stream();}
336 
337  bool eof() {return dataRemaining()==0 && m_fileReader.eof();}
338 
339 };
340 
341 PeekaheadBuffer::ReturnCode BGZFReader::readahead(ssize_t count)
342 {
343  BGZFHeader header;
344  // size of inflateBuffer can be determined from ISIZE, I think
345  uint8_t inflateBuffer[64*1024];
346  uint8_t gzipBuffer[64*1024+1];
347 
348  while(dataRemaining() < count) {
349  static int loopCount = 0;
350 
351  if(debug) std::cerr << "BGZFReader::readahead loopcount = " << loopCount++ << "\n";
352 
353  // here we actually read data:
354  // read what should be the header
355  // verify the header
356  // read the remainder of the block
357  // check the CRC validity or perhaps just call unzip
358  //
359  // XXX the sizeof(header) is wrong:
360  PeekaheadBuffer::ReturnCode rc = m_fileReader.read((uint8_t *) (&header), sizeof(header));
361 
362  if(rc == endOfFile) {
363  return endOfFile;
364  }
365 
366  // if we have a bad header, start looking forward for a good one,
367  if(!header.sane()) {
368  // sync does not consume the next good header, it simply syncs()
369  // the data stream to the next believed good BGZF header:
370  if(debug) std::cerr << "BGZFReader::readahead found corrupt BGZF header - now calling sync()\n";
371  rc = sync();
372  //
373  // even though we can now decompress, we need to tell the caller
374  // what is up before they call for more data (caller needs to
375  // sync its own record stream):
376  return rc;
377  }
378 
379  // Read the remainder of the block.
380  // BSIZE is size of the entire block - 1, so compensate.
381  rc = m_fileReader.read((uint8_t *) &gzipBuffer, header.BSIZE() + 1 - sizeof(header));
382 
383  if(rc == reSync) {
384  if(debug) std::cerr << "BGZFReader::readahead got incomplete BGZF read - now calling sync()\n";
385  sync();
386  return reSync;
387  }
388 
389  //
390  // we read a header, but our attempt to read more data ended early,
391  // so best to just return EOF
392  //
393  if(rc == endOfFile) {
394  return rc;
395  }
396 
397  PeekaheadBuffer::ReturnCode bgzf_rc = ok;
398  // zs.opaque is set when zalloc is NULL
399  //
400  // NB: zlib inflateInit2() has valgrind errors
401  // in versions <1.2.4 - those can be ignored.
402  //
403  z_stream zs;
404  zs.zalloc = NULL;
405  zs.zfree = NULL;
406  zs.next_in = gzipBuffer;
407  zs.avail_in = header.BSIZE() - 16; // XXX need to check docs for inflate
408  zs.next_out = inflateBuffer;
409  zs.avail_out = sizeof(inflateBuffer);
410 
411  // -15 --> raw inflate - don't look for gzip or zlib header
412  // This can be optimized - inflateInit2 does a malloc of
413  // approximately 10K (sizeof(inflate_state))
414  if(inflateInit2(&zs, -15) != Z_OK) {
415  bgzf_rc = reSync;
416  if(debug) std::cerr << "BGZFReader::readahead - inflateInit2 failed (out of memory?)\n";
417  // XXX fatal?
418  }
419  if(bgzf_rc==ok && inflate(&zs, Z_FINISH) != Z_STREAM_END) {
420  bgzf_rc = reSync;
421  if(debug) std::cerr << "BGZFReader::readahead - inflate failed (bad data), calling sync()\n";
422  }
423 
424  if(bgzf_rc == ok) {
425  if(inflateEnd(&zs) == Z_OK) {
426  // do something with zs.total_out
427  if(debug) std::cout << "hey, got data! zs.total_out == " << zs.total_out << "\n";
428 
429  // append the newly decompressed data
430  insert(end(), &inflateBuffer[0], &inflateBuffer[0] + zs.total_out);
431  } else {
432  // seems exceptionall unlikely, but check this error case too
433  bgzf_rc = reSync;
434  if(debug) std::cerr << "BGZFReader::readahead - inflateInit2 failed (out of memory?)\n";
435  // XXX fatal?
436  }
437  }
438 
439  if(bgzf_rc != ok) {
440  inflateEnd(&zs);
441  sync();
442  return bgzf_rc;
443  }
444 
445  // may need to get more data - loop back till all is complete
446  }
447 
448  return ok;
449 
450 }
451 
452 
#if 0
//
// Disabled manual test: wrap stdin in a BGZFReader, ask for 64 bytes
// of decompressed data and print what came back.
//
void testBGZFBuffer()
{
    BGZFReader reader(stdin);
    std::vector<uint8_t>::iterator position;

    std::cout << "size = " << reader.dataRemaining() << "\n";

    //
    // readahead() is expected to inflate a BGZF block into the buffer,
    // or to report a resync when the BGZF data turns out to be damaged.
    //
    const BGZFReader::ReturnCode status = reader.readahead(64);

    std::cout << "rc = " << status << " - expect ok (1)\n";
    std::cout << "size (expect 64) = " << reader.size() << "\n";
}


int main(int argc, const char **argv)
{
    testBGZFBuffer();
}
#endif
480 
481 
482 
483 int BgzfFileTypeRecovery::close()
484 {
485  if(bgzfReader) delete bgzfReader;
486  bgzfReader = NULL;
487  return true;
488 }
489 
490 
491 BgzfFileTypeRecovery::BgzfFileTypeRecovery(const char * filename, const char * mode)
492 {
493  if(tolower(mode[0])=='r') {
494  FILE *f = fopen(filename,"r");
495  bgzfReader = new BGZFReader(f);
496  } else {
497  // die for now
498  if(debug) std::cerr << "Unable to open " << filename << " in mode " << mode << ".\n";
499  close();
500  }
501 }
502 
503 //
504 // Why is this ever called?
505 //
506 bool BgzfFileTypeRecovery::operator == (void * rhs)
507 {
508  throw std::logic_error("BgzfFileTypeRecovery::operator == is dangerous - do not use");
509  return false;
510 }
511 
512 bool BgzfFileTypeRecovery::operator != (void * rhs)
513 {
514  throw std::logic_error("BgzfFileTypeRecovery::operator != is dangerous - do not use");
515  return false;
516 }
517 
518 int BgzfFileTypeRecovery::eof()
519 {
520  return bgzfReader->eof();
521 }
522 
523 unsigned int BgzfFileTypeRecovery::write(const void * buffer, unsigned int size)
524 {
525  // currently unsupported
526  return 0;
527 }
528 
529 int BgzfFileTypeRecovery::read(void * buffer, unsigned int size)
530 {
531 
532  if(bgzfReader == NULL) {
533  return 0;
534  }
535 
536  PeekaheadBuffer::ReturnCode rc = bgzfReader->read((uint8_t *) buffer, size);
537  // endOfFile = -1,
538 // reSync = 0,
539 // ok = 1
540  switch(rc) {
541  case PeekaheadBuffer::endOfFile:
542  // set a flag?
543  return 0;
544  case PeekaheadBuffer::reSync:
545  // we could encode more info in the exception message here:
546  if(debug) std::cerr << "throwing BGZF sync exception\n";
547  throw std::runtime_error("BGZF stream resync");
548  case PeekaheadBuffer::ok:
549  //
550  // in bgzfReader, we always are ensured we
551  // get the full amount of the read, otherwise
552  // an error is thrown.
553  //
554  return size;
555  }
556  // NOTREACHED
557  return 0;
558 }
559 
560 int64_t BgzfFileTypeRecovery::tell()
561 {
562  // currently unsupported
563  return 0;
564 }
565 
566 bool BgzfFileTypeRecovery::seek(int64_t offset, int origin)
567 {
568  // currently unsupported
569  return 0;
570 }
571 
572 
573 bool BgzfFileTypeRecovery::attemptRecoverySync(bool (*checkSignature)(void *data) , int length)
574 {
575  //
576  // creep along a byte at a time, checking for signature.
577  //
578  // possibly slow. should only need to scan ahead < 64K bytes
579  // or so, however, so should recover in "reasonable" time.
580  //
581  while( bgzfReader->readahead(length) == PeekaheadBuffer::ok) {
582  char ch;
583  void *src = &(*(bgzfReader->begin())) + bgzfReader->startPosition();
584 
585  //
586  // readahead ensures we have 'length' bytes of
587  // data to check that is valid in the buffer.
588  //
589  if((*checkSignature)(src)) return true;
590  PeekaheadBuffer::ReturnCode rc = bgzfReader->read((uint8_t *) &ch, 1);
591  if(rc!=PeekaheadBuffer::ok) return false;
592  // we consumed a byte, so go back to top of loop,
593  // resume filling buffer (if need be) and re-check
594  }
595 
596 
597  return false;
598 }
599 
600 #endif