SeqAn3  3.0.2
The Modern C++ library for sequence analysis.
format_vienna.hpp
Go to the documentation of this file.
1 // -----------------------------------------------------------------------------------------------------
2 // Copyright (c) 2006-2020, Knut Reinert & Freie Universität Berlin
3 // Copyright (c) 2016-2020, Knut Reinert & MPI für molekulare Genetik
4 // This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5 // shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
6 // -----------------------------------------------------------------------------------------------------
7 
13 #pragma once
14 
15 #include <cstdio>
16 #include <iterator>
17 #include <stack>
18 #include <string>
19 #include <string_view>
20 #include <type_traits>
21 #include <vector>
22 
42 #include <seqan3/std/algorithm>
43 #include <seqan3/std/ranges>
44 
45 namespace seqan3
46 {
84 {
85 public:
89  format_vienna() noexcept = default;
90  format_vienna(format_vienna const &) noexcept = default;
91  format_vienna & operator=(format_vienna const &) noexcept = default;
92  format_vienna(format_vienna &&) noexcept = default;
93  format_vienna & operator=(format_vienna &&) noexcept = default;
94  ~format_vienna() noexcept = default;
96 
98  static inline std::vector<std::string> file_extensions
99  {
100  { "dbn" },
101  { "fasta" },
102  { "fa" }
103  };
104 
105 protected:
107  template <typename stream_type, // constraints checked by file
108  typename seq_legal_alph_type,
109  bool structured_seq_combined,
110  typename seq_type, // other constraints checked inside function
111  typename id_type,
112  typename bpp_type,
113  typename structure_type,
114  typename energy_type,
115  typename react_type,
116  typename comment_type,
117  typename offset_type>
118  void read_structure_record(stream_type & stream,
120  seq_type & seq,
121  id_type & id,
122  bpp_type & bpp,
123  structure_type & structure,
124  energy_type & energy,
125  react_type & SEQAN3_DOXYGEN_ONLY(react),
126  react_type & SEQAN3_DOXYGEN_ONLY(react_err),
127  comment_type & SEQAN3_DOXYGEN_ONLY(comment),
128  offset_type & SEQAN3_DOXYGEN_ONLY(offset))
129  {
130  auto stream_view = views::istreambuf(stream);
131 
132  // READ ID (if present)
133  auto constexpr is_id = is_char<'>'>;
134  if (is_id(*begin(stream_view)))
135  {
136  if constexpr (!detail::decays_to_ignore_v<id_type>)
137  {
138  if (options.truncate_ids)
139  {
140  std::ranges::copy(stream_view | std::views::drop_while(is_id || is_blank) // skip leading >
141  | views::take_until_or_throw(is_cntrl || is_blank)
142  | views::char_to<std::ranges::range_value_t<id_type>>,
143  std::cpp20::back_inserter(id));
144  detail::consume(stream_view | views::take_line_or_throw);
145  }
146  else
147  {
148  std::ranges::copy(stream_view | std::views::drop_while(is_id || is_blank) // skip leading >
149  | views::take_line_or_throw
150  | views::char_to<std::ranges::range_value_t<id_type>>,
151  std::cpp20::back_inserter(id));
152  }
153  }
154  else
155  {
156  detail::consume(stream_view | views::take_line_or_throw);
157  }
158  }
159  else if constexpr (!detail::decays_to_ignore_v<id_type>)
160  {
161  auto constexpr is_legal_seq = is_in_alphabet<seq_legal_alph_type>;
162  if (!is_legal_seq(*begin(stream_view))) // if neither id nor seq found: throw
163  {
164  throw parse_error{std::string{"Expected to be on beginning of ID or sequence, but "} +
165  is_id.msg + " and " + is_legal_seq.msg +
166  " evaluated to false on " + detail::make_printable(*begin(stream_view))};
167  }
168  }
169 
170  // READ SEQUENCE
171  if constexpr (!detail::decays_to_ignore_v<seq_type>)
172  {
173  auto constexpr is_legal_seq = is_in_alphabet<seq_legal_alph_type>;
174  std::ranges::copy(stream_view | views::take_line_or_throw // until end of line
175  | std::views::filter(!(is_space || is_digit)) // ignore whitespace and numbers
176  | std::views::transform([is_legal_seq](char const c)
177  {
178  if (!is_legal_seq(c)) // enforce legal alphabet
179  {
180  throw parse_error{std::string{"Encountered an unexpected letter: "} +
181  is_legal_seq.msg +
182  " evaluated to false on " +
183  detail::make_printable(c)};
184  }
185  return c;
186  })
187  | views::char_to<std::ranges::range_value_t<seq_type>>, // convert to actual target alphabet
188  std::cpp20::back_inserter(seq));
189  }
190  else
191  {
192  detail::consume(stream_view | views::take_line_or_throw);
193  }
194 
195  // READ STRUCTURE (if present)
196  [[maybe_unused]] int64_t structure_length{};
197  if constexpr (!detail::decays_to_ignore_v<structure_type>)
198  {
199  if constexpr (structured_seq_combined)
200  {
201  assert(std::addressof(seq) == std::addressof(structure));
202  using alph_type = typename std::ranges::range_value_t<structure_type>::structure_alphabet_type;
203  // We need the structure_length parameter to count the length of the structure while reading
204  // because we cannot infer it from the (already resized) structure_seq object.
205  auto res = std::ranges::copy(read_structure<alph_type>(stream_view), std::ranges::begin(structure));
206  structure_length = std::ranges::distance(std::ranges::begin(structure), res.out);
207 
208  if constexpr (!detail::decays_to_ignore_v<bpp_type>)
209  detail::bpp_from_rna_structure<alph_type>(bpp, structure);
210  }
211  else
212  {
213  using alph_type = std::ranges::range_value_t<structure_type>;
214  std::ranges::copy(read_structure<alph_type>(stream_view), std::cpp20::back_inserter(structure));
215  structure_length = std::ranges::distance(structure);
216 
217  if constexpr (!detail::decays_to_ignore_v<bpp_type>)
218  detail::bpp_from_rna_structure<alph_type>(bpp, structure);
219  }
220  }
221  else if constexpr (!detail::decays_to_ignore_v<bpp_type>)
222  {
223  detail::bpp_from_rna_structure<wuss51>(bpp, read_structure<wuss51>(stream_view));
224  structure_length = std::ranges::distance(bpp);
225  }
226  else
227  {
228  detail::consume(stream_view | views::take_until(is_space)); // until whitespace
229  }
230 
231  if constexpr (!detail::decays_to_ignore_v<seq_type> &&
232  !(detail::decays_to_ignore_v<structure_type> && detail::decays_to_ignore_v<bpp_type>))
233  {
234  if (std::ranges::distance(seq) != structure_length)
235  throw parse_error{"Found sequence and associated structure of different length."};
236  }
237 
238  // READ ENERGY (if present)
239  if constexpr (!detail::decays_to_ignore_v<energy_type>)
240  {
241  std::string e_str = stream_view | views::take_line
242  | std::views::filter(!(is_space || is_char<'('> || is_char<')'>))
243  | views::to<std::string>;
244  if (!e_str.empty())
245  {
246  size_t num_processed;
247  energy = std::stod(e_str, &num_processed);
248  if (num_processed != e_str.size()) // [[unlikely]]
249  {
250  throw parse_error{std::string{"Failed to parse energy value '"} + e_str + "'."};
251  }
252  }
253  }
254  else
255  {
256  detail::consume(stream_view | views::take_line);
257  }
258  detail::consume(stream_view | views::take_until(!is_space));
259  }
260 
262  template <typename stream_type, // constraints checked by file
263  typename seq_type, // other constraints checked inside function
264  typename id_type,
265  typename bpp_type,
266  typename structure_type,
267  typename energy_type,
268  typename react_type,
269  typename comment_type,
270  typename offset_type>
271  void write_structure_record(stream_type & stream,
272  structure_file_output_options const & options,
273  seq_type && seq,
274  id_type && id,
275  bpp_type && SEQAN3_DOXYGEN_ONLY(bpp),
276  structure_type && structure,
277  energy_type && energy,
278  react_type && SEQAN3_DOXYGEN_ONLY(react),
279  react_type && SEQAN3_DOXYGEN_ONLY(react_err),
280  comment_type && SEQAN3_DOXYGEN_ONLY(comment),
281  offset_type && SEQAN3_DOXYGEN_ONLY(offset))
282  {
283  std::cpp20::ostreambuf_iterator stream_it{stream};
284 
285  // WRITE ID (optional)
286  if constexpr (!detail::decays_to_ignore_v<id_type>)
287  {
288  if (!std::ranges::empty(id))
289  {
290  stream_it = '>';
291  stream_it = ' ';
292  std::ranges::copy(id, stream_it);
293  detail::write_eol(stream_it, options.add_carriage_return);
294  }
295  }
296 
297  // WRITE SEQUENCE
298  if constexpr (!detail::decays_to_ignore_v<seq_type>)
299  {
300  if (std::ranges::empty(seq)) //[[unlikely]]
301  throw std::runtime_error{"The SEQ field may not be empty when writing Vienna files."};
302 
303  std::ranges::copy(seq | views::to_char, stream_it);
304  detail::write_eol(stream_it, options.add_carriage_return);
305  }
306  else
307  {
308  throw std::logic_error{"The SEQ and STRUCTURED_SEQ fields may not both be set to ignore "
309  "when writing Vienna files."};
310  }
311 
312  // WRITE STRUCTURE (optional)
313  if constexpr (!detail::decays_to_ignore_v<structure_type>)
314  {
315  if (!std::ranges::empty(structure))
316  std::ranges::copy(structure | views::to_char, stream_it);
317 
318  // WRITE ENERGY (optional)
319  if constexpr (!detail::decays_to_ignore_v<energy_type>)
320  {
321  if (energy)
322  {
323 // TODO(joergi-w) enable the following when std::to_chars is implemented for float types
324 // auto [endptr, ec] = std::to_chars(str.data(),
325 // str.data() + str.size(),
326 // energy,
327 // std::chars_format::fixed,
328 // options.precision);
329 // if (ec == std::errc())
330 // std::ranges::copy(str.data(), endptr, stream_it);
331 // else
332 // throw std::runtime_error{"The energy could not be transformed into a string."};
333 
334  stream_it = ' ';
335  stream_it = '(';
336 
337  std::array<char, 100> str;
338  int len = std::snprintf(str.data(), 100, "%.*f", options.precision, energy);
339  if (len < 0 || len >= 100)
340  throw std::runtime_error{"The energy could not be transformed into a string."};
341  std::ranges::copy(str.data(), str.data() + len, stream_it);
342 
343  stream_it = ')';
344  }
345  }
346  detail::write_eol(stream_it, options.add_carriage_return);
347  }
348  else if constexpr (!detail::decays_to_ignore_v<energy_type>)
349  {
350  throw std::logic_error{"The ENERGY field cannot be written to a Vienna file without providing STRUCTURE."};
351  }
352  }
353 
354 private:
361  template <typename alph_type, typename stream_view_type>
362  auto read_structure(stream_view_type & stream_view)
363  {
364  auto constexpr is_legal_structure = is_in_alphabet<alph_type>;
365  return stream_view | views::take_until(is_space) // until whitespace
366  | std::views::transform([is_legal_structure](char const c)
367  {
368  if (!is_legal_structure(c))
369  {
370  throw parse_error{
371  std::string{"Encountered an unexpected letter: "} +
372  is_legal_structure.msg +
373  " evaluated to false on " + detail::make_printable(c)};
374  }
375  return c;
376  }) // enforce legal alphabet
377  | views::char_to<alph_type>; // convert to actual target alphabet
378  }
379 };
380 
381 } // namespace seqan3::detail
Adaptations of algorithms from the Ranges TS.
Provides alphabet adaptations for standard char types.
Provides seqan3::views::char_to.
The Vienna format (dot bracket notation) for RNA sequences with secondary structure.
Definition: format_vienna.hpp:84
static std::vector< std::string > file_extensions
The valid file extensions for this format; note that you can modify this value.
Definition: format_vienna.hpp:99
format_vienna() noexcept=default
Defaulted.
void read_structure_record(stream_type &stream, structure_file_input_options< seq_legal_alph_type, structured_seq_combined > const &options, seq_type &seq, id_type &id, bpp_type &bpp, structure_type &structure, energy_type &energy, react_type &react, react_type &react_err, comment_type &comment, offset_type &offset)
Read from the specified stream and back-insert into the given field buffers.
Definition: format_vienna.hpp:118
constexpr sequenced_policy seq
Global execution policy object for sequenced execution policy.
Definition: execution.hpp:54
constexpr auto istreambuf
A view factory that returns a view over the stream buffer of an input stream.
Definition: istreambuf.hpp:113
Provides various utility functions.
Provides seqan3::fast_istreambuf_iterator and seqan3::fast_ostreambuf_iterator, as well as,...
Provides seqan3::views::istreambuf.
The main SeqAn3 namespace.
Definition: aligned_sequence_concept.hpp:29
SeqAn specific customisations in the standard namespace.
Provides character predicates for tokenisation.
Provides various utility functions.
Provides various transformation traits used by the range module.
Adaptations of concepts from the Ranges TS.
The options type defines various option members that influence the behaviour of all or some formats.
Definition: input_options.hpp:28
Provides seqan3::structure_file_input_format.
Provides seqan3::structure_file_input_options.
Provides seqan3::structure_file_output_format and auxiliary classes.
Provides seqan3::structure_file_output_options.
Provides seqan3::views::take.
Provides seqan3::views::take_line and seqan3::views::take_line_or_throw.
Provides seqan3::views::take_until and seqan3::views::take_until_or_throw.
Provides seqan3::views::to.
Provides seqan3::views::to_char.
Provides the WUSS format for RNA structure.