SeqAn3  3.0.2
The Modern C++ library for sequence analysis.
format_fastq.hpp
Go to the documentation of this file.
1 // -----------------------------------------------------------------------------------------------------
2 // Copyright (c) 2006-2020, Knut Reinert & Freie Universität Berlin
3 // Copyright (c) 2016-2020, Knut Reinert & MPI für molekulare Genetik
4 // This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5 // shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
6 // -----------------------------------------------------------------------------------------------------
7 
13 #pragma once
14 
15 #include <iterator>
16 #include <string>
17 #include <string_view>
18 #include <vector>
19 
40 #include <seqan3/std/algorithm>
41 #include <seqan3/std/ranges>
42 
43 namespace seqan3
44 {
45 
78 {
79 public:
83  format_fastq() noexcept = default;
84  format_fastq(format_fastq const &) noexcept = default;
85  format_fastq & operator=(format_fastq const &) noexcept = default;
86  format_fastq(format_fastq &&) noexcept = default;
87  format_fastq & operator=(format_fastq &&) noexcept = default;
88  ~format_fastq() noexcept = default;
90 
//!\brief The valid file extensions for this format; note that you can modify this value.
static inline std::vector<std::string> file_extensions
{
    { "fastq" },
    { "fq" }
};
97 
98 protected:
100  template <typename stream_type, // constraints checked by file
101  typename seq_legal_alph_type, bool seq_qual_combined,
102  typename seq_type, // other constraints checked inside function
103  typename id_type,
104  typename qual_type>
105  void read_sequence_record(stream_type & stream,
107  seq_type & sequence,
108  id_type & id,
109  qual_type & qualities)
110  {
111  auto stream_view = views::istreambuf(stream);
112  auto stream_it = begin(stream_view);
113 
114  // cache the begin position so we write quals to the same position as seq in seq_qual case
115  size_t sequence_size_before = 0;
116  size_t sequence_size_after = 0;
117  if constexpr (!detail::decays_to_ignore_v<seq_type>)
118  sequence_size_before = size(sequence);
119 
120  /* ID */
121  if (*stream_it != '@') // [[unlikely]]
122  {
123  throw parse_error{std::string{"Expected '@' on beginning of ID line, got: "} +
124  detail::make_printable(*stream_it)};
125  }
126  ++stream_it; // skip '@'
127 
128  if constexpr (!detail::decays_to_ignore_v<id_type>)
129  {
130  if (options.truncate_ids)
131  {
132  std::ranges::copy(stream_view | views::take_until_or_throw(is_cntrl || is_blank)
133  | views::char_to<std::ranges::range_value_t<id_type>>,
134  std::cpp20::back_inserter(id));
135  detail::consume(stream_view | views::take_line_or_throw);
136  }
137  else
138  {
139  std::ranges::copy(stream_view | views::take_line_or_throw
140  | views::char_to<std::ranges::range_value_t<id_type>>,
141  std::cpp20::back_inserter(id));
142  }
143  }
144  else
145  {
146  detail::consume(stream_view | views::take_line_or_throw);
147  }
148 
149  /* Sequence */
150  auto seq_view = stream_view | views::take_until_or_throw(is_char<'+'>) // until 2nd ID line
151  | std::views::filter(!is_space); // ignore whitespace
152  if constexpr (!detail::decays_to_ignore_v<seq_type>)
153  {
154  auto constexpr is_legal_alph = is_in_alphabet<seq_legal_alph_type>;
155  std::ranges::copy(seq_view | std::views::transform([is_legal_alph] (char const c) // enforce legal alphabet
156  {
157  if (!is_legal_alph(c))
158  {
159  throw parse_error{std::string{"Encountered an unexpected letter: "} +
160  is_legal_alph.msg +
161  " evaluated to false on " +
162  detail::make_printable(c)};
163  }
164  return c;
165  })
166  | views::char_to<std::ranges::range_value_t<seq_type>>, // convert to actual target alphabet
167  std::cpp20::back_inserter(sequence));
168  sequence_size_after = size(sequence);
169  }
170  else // consume, but count
171  {
172  auto it = begin(seq_view);
173  auto it_end = end(seq_view);
174  while (it != it_end)
175  {
176  ++it;
177  ++sequence_size_after;
178  }
179  }
180 
181  /* 2nd ID line */
182  if (*stream_it != '+') // [[unlikely]]
183  {
184  throw parse_error{std::string{"Expected '+' on beginning of 2nd ID line, got: "} +
185  detail::make_printable(*stream_it)};
186  }
187  detail::consume(stream_view | views::take_line_or_throw);
188 
189  /* Qualities */
190  auto qview = stream_view | std::views::filter(!is_space) // this consumes trailing newline
191  | views::take_exactly_or_throw(sequence_size_after - sequence_size_before);
192  if constexpr (seq_qual_combined)
193  {
194  // seq_qual field implies that they are the same variable
195  assert(std::addressof(sequence) == std::addressof(qualities));
196  std::ranges::copy(qview | views::char_to<typename std::ranges::range_value_t<qual_type>::quality_alphabet_type>,
197  begin(qualities) + sequence_size_before);
198  }
199  else if constexpr (!detail::decays_to_ignore_v<qual_type>)
200  {
201  std::ranges::copy(qview | views::char_to<std::ranges::range_value_t<qual_type>>,
202  std::cpp20::back_inserter(qualities));
203  }
204  else
205  {
206  detail::consume(qview);
207  }
208  }
209 
211  template <typename stream_type, // constraints checked by file
212  typename seq_type, // other constraints checked inside function
213  typename id_type,
214  typename qual_type>
215  void write_sequence_record(stream_type & stream,
216  sequence_file_output_options const & options,
217  seq_type && sequence,
218  id_type && id,
219  qual_type && qualities)
220  {
221  seqan3::detail::fast_ostreambuf_iterator stream_it{*stream.rdbuf()};
222 
223  // ID
224  if constexpr (detail::decays_to_ignore_v<id_type>)
225  {
226  throw std::logic_error{"The ID field may not be set to ignore when writing FASTQ files."};
227  }
228  else
229  {
230  if (std::ranges::empty(id)) //[[unlikely]]
231  throw std::runtime_error{"The ID field may not be empty when writing FASTQ files."};
232 
233  stream_it = '@';
234  stream_it.write_range(id);
235  stream_it.write_end_of_line(options.add_carriage_return);
236  }
237 
238  // Sequence
239  if constexpr (detail::decays_to_ignore_v<seq_type>)
240  {
241  throw std::logic_error{"The SEQ and SEQ_QUAL fields may not both be set to ignore when writing FASTQ files."};
242  }
243  else
244  {
245  if (std::ranges::empty(sequence)) //[[unlikely]]
246  throw std::runtime_error{"The SEQ field may not be empty when writing FASTQ files."};
247 
248  stream_it.write_range(sequence | views::to_char);
249  stream_it.write_end_of_line(options.add_carriage_return);
250  }
251 
252  // 2nd ID-line
253  if constexpr (!detail::decays_to_ignore_v<id_type>)
254  {
255  stream_it = '+';
256 
257  if (options.fastq_double_id)
258  stream_it.write_range(id);
259 
260  stream_it.write_end_of_line(options.add_carriage_return);
261  }
262 
263  // Quality line
264  if constexpr (detail::decays_to_ignore_v<qual_type>)
265  {
266  throw std::logic_error{"The QUAL and SEQ_QUAL fields may not both be set to ignore when writing FASTQ files."};
267  }
268  else
269  {
270  if (std::ranges::empty(qualities)) //[[unlikely]]
271  throw std::runtime_error{"The SEQ field may not be empty when writing FASTQ files."};
272 
273  if constexpr (std::ranges::sized_range<seq_type> && std::ranges::sized_range<qual_type>)
274  {
275  assert(std::ranges::size(sequence) == std::ranges::size(qualities));
276  }
277 
278  stream_it.write_range(qualities | views::to_char);
279  stream_it.write_end_of_line(options.add_carriage_return);
280  }
281  }
282 };
283 
284 } // namespace seqan3
T addressof(T... args)
Adaptations of algorithms from the Ranges TS.
Provides aliases for qualified.
Provides alphabet adaptations for standard char types.
Provides seqan3::views::char_to.
The FastQ format.
Definition: format_fastq.hpp:78
void read_sequence_record(stream_type &stream, sequence_file_input_options< seq_legal_alph_type, seq_qual_combined > const &options, seq_type &sequence, id_type &id, qual_type &qualities)
Read from the specified stream and back-insert into the given field buffers.
Definition: format_fastq.hpp:105
format_fastq() noexcept=default
Defaulted.
void write_sequence_record(stream_type &stream, sequence_file_output_options const &options, seq_type &&sequence, id_type &&id, qual_type &&qualities)
Write the given fields to the specified stream.
Definition: format_fastq.hpp:215
static std::vector< std::string > file_extensions
The valid file extensions for this format; note that you can modify this value.
Definition: format_fastq.hpp:93
Provides seqan3::dna5, container aliases and string literals.
constexpr auto is_blank
Checks whether c is a blank character.
Definition: predicate.hpp:163
constexpr auto is_space
Checks whether c is a space character.
Definition: predicate.hpp:146
constexpr auto is_cntrl
Checks whether c is a control character.
Definition: predicate.hpp:110
constexpr size_t size
The size of a type pack.
Definition: traits.hpp:116
seqan3::type_list< trait_t< pack_t >... > transform
Apply a transformation trait to every type in the pack and return a seqan3::type_list of the results.
Definition: traits.hpp:307
auto const to_char
A view that calls seqan3::to_char() on each element in the input range.
Definition: to_char.hpp:65
constexpr auto take_until_or_throw
A view adaptor that returns elements from the underlying range until the functor evaluates to true (t...
Definition: take_until.hpp:624
constexpr auto istreambuf
A view factory that returns a view over the stream buffer of an input stream.
Definition: istreambuf.hpp:113
constexpr auto take_exactly_or_throw
A view adaptor that returns the first size elements from the underlying range and also exposes size i...
Definition: take_exactly.hpp:91
auto const char_to
A view over an alphabet, given a range of characters.
Definition: char_to.hpp:69
constexpr auto take_line_or_throw
A view adaptor that returns a single line from the underlying range (throws if there is no end-of-lin...
Definition: take_line.hpp:90
Provides seqan3::detail::ignore_output_iterator for writing to null stream.
The generic concept for a sequence.
Provides various utility functions.
Provides seqan3::fast_istreambuf_iterator and seqan3::fast_ostreambuf_iterator, as well as,...
Provides seqan3::views::istreambuf.
The main SeqAn3 namespace.
Definition: aligned_sequence_concept.hpp:29
SeqAn specific customisations in the standard namespace.
Provides character predicates for tokenisation.
Provides various utility functions.
Provides various transformation traits used by the range module.
Adaptations of concepts from the Ranges TS.
Provides seqan3::sequence_file_input_format and auxiliary classes.
Provides seqan3::sequence_file_input_options.
Provides seqan3::sequence_file_output_format and auxiliary classes.
Provides seqan3::sequence_file_output_options.
Thrown if there is a parse error, such as reading an unexpected character from an input stream.
Definition: exception.hpp:48
The options type defines various option members that influence the behaviour of all or some formats.
Definition: input_options.hpp:26
bool truncate_ids
Read the ID string only up until the first whitespace character.
Definition: input_options.hpp:28
The options type defines various option members that influence the behaviour of all or some formats.
Definition: output_options.hpp:22
bool add_carriage_return
The default plain text line-ending is "\n", but on Windows an additional carriage return is recommend...
Definition: output_options.hpp:39
bool fastq_double_id
Whether to write the ID only '@' or also after '+' line.
Definition: output_options.hpp:34
Provides seqan3::views::take.
Provides seqan3::views::take_exactly and seqan3::views::take_exactly_or_throw.
Provides seqan3::views::take_line and seqan3::views::take_line_or_throw.
Provides seqan3::views::take_until and seqan3::views::take_until_or_throw.
Provides seqan3::views::to_char.