Halide 14.0.0
Func.h
1 #ifndef HALIDE_FUNC_H
2 #define HALIDE_FUNC_H
3 
4 /** \file
5  *
6  * Defines Func - the front-end handle on a halide function, and related classes.
7  */
8 
9 #include "Argument.h"
10 #include "Expr.h"
11 #include "JITModule.h"
12 #include "Module.h"
13 #include "Param.h"
14 #include "Pipeline.h"
15 #include "RDom.h"
16 #include "Target.h"
17 #include "Tuple.h"
18 #include "Var.h"
19 
20 #include <map>
21 #include <utility>
22 
23 namespace Halide {
24 
25 class OutputImageParam;
26 class ParamMap;
27 
28 /** A class that can represent Vars or RVars. Used for reorder calls
29  * which can accept a mix of either. */
30 struct VarOrRVar {
31  VarOrRVar(const std::string &n, bool r)
32  : var(n), rvar(n), is_rvar(r) {
33  }
34  VarOrRVar(const Var &v)
35  : var(v), is_rvar(false) {
36  }
37  VarOrRVar(const RVar &r)
38  : rvar(r), is_rvar(true) {
39  }
40  VarOrRVar(const RDom &r)
41  : rvar(RVar(r)), is_rvar(true) {
42  }
43  template<int N>
44  VarOrRVar(const ImplicitVar<N> &u)
45  : var(u), is_rvar(false) {
46  }
47 
48  const std::string &name() const {
49  if (is_rvar) {
50  return rvar.name();
51  } else {
52  return var.name();
53  }
54  }
55 
56  Var var;
57  RVar rvar;
58  bool is_rvar;
59 };
60 
61 class ImageParam;
62 
63 namespace Internal {
64 class Function;
65 struct Split;
66 struct StorageDim;
67 } // namespace Internal
68 
69 /** A single definition of a Func. May be a pure or update definition. */
70 class Stage {
71  /** Reference to the Function this stage (or definition) belongs to. */
72  Internal::Function function;
73  Internal::Definition definition;
74  /** Indicate which stage the definition belongs to (0 for initial
75  * definition, 1 for first update, etc.). */
76  size_t stage_index;
77  /** Pure Vars of the Function (from the init definition). */
78  std::vector<Var> dim_vars;
79 
80  void set_dim_type(const VarOrRVar &var, Internal::ForType t);
81  void set_dim_device_api(const VarOrRVar &var, DeviceAPI device_api);
82  void split(const std::string &old, const std::string &outer, const std::string &inner,
83  const Expr &factor, bool exact, TailStrategy tail);
84  void remove(const std::string &var);
85  Stage &purify(const VarOrRVar &old_name, const VarOrRVar &new_name);
86 
87  const std::vector<Internal::StorageDim> &storage_dims() const {
88  return function.schedule().storage_dims();
89  }
90 
91  Stage &compute_with(LoopLevel loop_level, const std::map<std::string, LoopAlignStrategy> &align);
92 
93 public:
94  Stage(Internal::Function f, Internal::Definition d, size_t stage_index)
95  : function(std::move(f)), definition(std::move(d)), stage_index(stage_index) {
96  internal_assert(definition.defined());
97 
98  dim_vars.reserve(function.args().size());
99  for (const auto &arg : function.args()) {
100  dim_vars.emplace_back(arg);
101  }
102  internal_assert(definition.args().size() == dim_vars.size());
103  }
104 
105  /** Return the current StageSchedule associated with this Stage. For
106  * introspection only: to modify schedule, use the Func interface. */
107  const Internal::StageSchedule &get_schedule() const {
108  return definition.schedule();
109  }
110 
111  /** Return a string describing the current var list taking into
112  * account all the splits, reorders, and tiles. */
113  std::string dump_argument_list() const;
114 
115  /** Return the name of this stage, e.g. "f.update(2)" */
116  std::string name() const;
117 
118  /** Calling rfactor() on an associative update definition of a Func will split
119  * the update into an intermediate which computes the partial results and
120  * replaces the current update definition with a new definition which merges
121  * the partial results. If called on an init/pure definition, this will
122  * throw an error. rfactor() will automatically infer the associative reduction
123  * operator and identity of the operator. If it can't prove the operation
124  * is associative or if it cannot find an identity for that operator, this
125  * will throw an error. In addition, commutativity of the operator is required
126  * if rfactor() is called on the inner dimension but excluding the outer
127  * dimensions.
128  *
129  * rfactor() takes as input 'preserved', which is a list of <RVar, Var> pairs.
130  * The rvars not listed in 'preserved' are removed from the original Func and
131  * are lifted to the intermediate Func. The remaining rvars (the ones in
132  * 'preserved') are made pure in the intermediate Func. The intermediate Func's
133  * update definition inherits all scheduling directives (e.g. split, fuse, etc.)
134  * applied to the original Func's update definition. The loop order of the
135  * intermediate Func's update definition is the same as the original, although
136  * the RVars in 'preserved' are replaced by the new pure Vars. The loop order of the
137  * intermediate Func's init definition from innermost to outermost is the args'
138  * order of the original Func's init definition followed by the new pure Vars.
139  *
140  * The intermediate Func also inherits storage order from the original Func
141  * with the new pure Vars added to the outermost.
142  *
143  * For example, f.update(0).rfactor({{r.y, u}}) would rewrite a pipeline like this:
144  \code
145  f(x, y) = 0;
146  f(x, y) += g(r.x, r.y);
147  \endcode
148  * into a pipeline like this:
149  \code
150  f_intm(x, y, u) = 0;
151  f_intm(x, y, u) += g(r.x, u);
152 
153  f(x, y) = 0;
154  f(x, y) += f_intm(x, y, r.y);
155  \endcode
156  *
157  * This has a variety of uses. You can use it to split computation of an associative reduction:
158  \code
159  f(x, y) = 10;
160  RDom r(0, 96);
161  f(x, y) = max(f(x, y), g(x, y, r.x));
162  f.update(0).split(r.x, rxo, rxi, 8).reorder(y, x).parallel(x);
163  f.update(0).rfactor({{rxo, u}}).compute_root().parallel(u).update(0).parallel(u);
164  \endcode
165  *
166  *, which is equivalent to:
167  \code
168  parallel for u = 0 to 11:
169  for y:
170  for x:
171  f_intm(x, y, u) = -inf
172  parallel for x:
173  for y:
174  parallel for u = 0 to 11:
175  for rxi = 0 to 7:
176  f_intm(x, y, u) = max(f_intm(x, y, u), g(x, y, 8*u + rxi))
177  for y:
178  for x:
179  f(x, y) = 10
180  parallel for x:
181  for y:
182  for rxo = 0 to 11:
183  f(x, y) = max(f(x, y), f_intm(x, y, rxo))
184  \endcode
185  *
186  */
187  // @{
188  Func rfactor(std::vector<std::pair<RVar, Var>> preserved);
189  Func rfactor(const RVar &r, const Var &v);
190  // @}
191 
192  /** Schedule the iteration over this stage to be fused with another
193  * stage 's' from outermost loop to a given LoopLevel. 'this' stage will
194  * be computed AFTER 's' in the innermost fused dimension. There should not
195  * be any dependencies between those two fused stages. If either of the
196  * stages being fused is a stage of an extern Func, this will throw an error.
197  *
198  * Note that the two stages that are fused together should have the same
199  * exact schedule from the outermost to the innermost fused dimension, and
200  * the stage we are calling compute_with on should not have specializations,
201  * e.g. f2.compute_with(f1, x) is allowed only if f2 has no specializations.
202  *
203  * Also, if a producer is desired to be computed at the fused loop level,
204  * the function passed to the compute_at() needs to be the "parent". Consider
205  * the following code:
206  \code
207  input(x, y) = x + y;
208  f(x, y) = input(x, y);
209  f(x, y) += 5;
210  g(x, y) = x - y;
211  g(x, y) += 10;
212  f.compute_with(g, y);
213  f.update().compute_with(g.update(), y);
214  \endcode
215  *
216  * To compute 'input' at the fused loop level at dimension y, we specify
217  * input.compute_at(g, y) instead of input.compute_at(f, y) since 'g' is
218  * the "parent" for this fused loop (i.e. 'g' is computed first before 'f'
219  * is computed). On the other hand, to compute 'input' at the innermost
220  * dimension of 'f', we specify input.compute_at(f, x) instead of
221  * input.compute_at(g, x) since the x dimension of 'f' is not fused
222  * (only the y dimension is).
223  *
224  * Given the constraints, this has a variety of uses. Consider the
225  * following code:
226  \code
227  f(x, y) = x + y;
228  g(x, y) = x - y;
229  h(x, y) = f(x, y) + g(x, y);
230  f.compute_root();
231  g.compute_root();
232  f.split(x, xo, xi, 8);
233  g.split(x, xo, xi, 8);
234  g.compute_with(f, xo);
235  \endcode
236  *
237  * This is equivalent to:
238  \code
239  for y:
240  for xo:
241  for xi:
242  f(8*xo + xi) = (8*xo + xi) + y
243  for xi:
244  g(8*xo + xi) = (8*xo + xi) - y
245  for y:
246  for x:
247  h(x, y) = f(x, y) + g(x, y)
248  \endcode
249  *
250  * The sizes of the dimensions of the stages computed_with do not have
251  * to match. Consider the following code where 'g' is half the size of 'f':
252  \code
253  Image<int> f_im(size, size), g_im(size/2, size/2);
254  input(x, y) = x + y;
255  f(x, y) = input(x, y);
256  g(x, y) = input(2*x, 2*y);
257  g.compute_with(f, y);
258  input.compute_at(f, y);
259  Pipeline({f, g}).realize({f_im, g_im});
260  \endcode
261  *
262  * This is equivalent to:
263  \code
264  for y = 0 to size-1:
265  for x = 0 to size-1:
266  input(x, y) = x + y;
267  for x = 0 to size-1:
268  f(x, y) = input(x, y)
269  for x = 0 to size/2-1:
270  if (y < size/2-1):
271  g(x, y) = input(2*x, 2*y)
272  \endcode
273  *
274  * 'align' specifies how the loop iteration of each dimension of the
275  * two stages being fused should be aligned in the fused loop nests
276  * (see LoopAlignStrategy for options). Consider the following loop nests:
277  \code
278  for z = f_min_z to f_max_z:
279  for y = f_min_y to f_max_y:
280  for x = f_min_x to f_max_x:
281  f(x, y, z) = x + y + z
282  for z = g_min_z to g_max_z:
283  for y = g_min_y to g_max_y:
284  for x = g_min_x to g_max_x:
285  g(x, y, z) = x - y - z
286  \endcode
287  *
288  * If no alignment strategy is specified, the following loop nest will be
289  * generated:
290  \code
291  for z = min(f_min_z, g_min_z) to max(f_max_z, g_max_z):
292  for y = min(f_min_y, g_min_y) to max(f_max_y, g_max_y):
293  for x = f_min_x to f_max_x:
294  if (f_min_z <= z <= f_max_z):
295  if (f_min_y <= y <= f_max_y):
296  f(x, y, z) = x + y + z
297  for x = g_min_x to g_max_x:
298  if (g_min_z <= z <= g_max_z):
299  if (g_min_y <= y <= g_max_y):
300  g(x, y, z) = x - y - z
301  \endcode
302  *
303  * Instead, these alignment strategies:
304  \code
305  g.compute_with(f, y, {{z, LoopAlignStrategy::AlignStart}, {y, LoopAlignStrategy::AlignEnd}});
306  \endcode
307  * will produce the following loop nest:
308  \code
309  f_loop_min_z = f_min_z
310  f_loop_max_z = max(f_max_z, (f_min_z - g_min_z) + g_max_z)
311  for z = f_min_z to f_loop_max_z:
312  f_loop_min_y = min(f_min_y, (f_max_y - g_max_y) + g_min_y)
313  f_loop_max_y = f_max_y
314  for y = f_loop_min_y to f_loop_max_y:
315  for x = f_min_x to f_max_x:
316  if (f_loop_min_z <= z <= f_loop_max_z):
317  if (f_loop_min_y <= y <= f_loop_max_y):
318  f(x, y, z) = x + y + z
319  for x = g_min_x to g_max_x:
320  g_shift_z = g_min_z - f_loop_min_z
321  g_shift_y = g_max_y - f_loop_max_y
322  if (g_min_z <= (z + g_shift_z) <= g_max_z):
323  if (g_min_y <= (y + g_shift_y) <= g_max_y):
324  g(x, y + g_shift_y, z + g_shift_z) = x - (y + g_shift_y) - (z + g_shift_z)
325  \endcode
326  *
327  * LoopAlignStrategy::AlignStart on dimension z will shift the loop iteration
328  * of 'g' at dimension z so that its starting value matches that of 'f'.
329  * Likewise, LoopAlignStrategy::AlignEnd on dimension y will shift the loop
330  * iteration of 'g' at dimension y so that its end value matches that of 'f'.
331  */
332  // @{
333  Stage &compute_with(LoopLevel loop_level, const std::vector<std::pair<VarOrRVar, LoopAlignStrategy>> &align);
334  Stage &compute_with(LoopLevel loop_level, LoopAlignStrategy align = LoopAlignStrategy::Auto);
335  Stage &compute_with(const Stage &s, const VarOrRVar &var, const std::vector<std::pair<VarOrRVar, LoopAlignStrategy>> &align);
336  Stage &compute_with(const Stage &s, const VarOrRVar &var, LoopAlignStrategy align = LoopAlignStrategy::Auto);
337  // @}
338 
339  /** Scheduling calls that control how the domain of this stage is
340  * traversed. See the documentation for Func for the meanings. */
341  // @{
342 
343  Stage &split(const VarOrRVar &old, const VarOrRVar &outer, const VarOrRVar &inner, const Expr &factor, TailStrategy tail = TailStrategy::Auto);
344  Stage &fuse(const VarOrRVar &inner, const VarOrRVar &outer, const VarOrRVar &fused);
345  Stage &serial(const VarOrRVar &var);
346  Stage &parallel(const VarOrRVar &var);
347  Stage &vectorize(const VarOrRVar &var);
348  Stage &unroll(const VarOrRVar &var);
349  Stage &parallel(const VarOrRVar &var, const Expr &task_size, TailStrategy tail = TailStrategy::Auto);
350  Stage &vectorize(const VarOrRVar &var, const Expr &factor, TailStrategy tail = TailStrategy::Auto);
351  Stage &unroll(const VarOrRVar &var, const Expr &factor, TailStrategy tail = TailStrategy::Auto);
352  Stage &tile(const VarOrRVar &x, const VarOrRVar &y,
353  const VarOrRVar &xo, const VarOrRVar &yo,
354  const VarOrRVar &xi, const VarOrRVar &yi, const Expr &xfactor, const Expr &yfactor,
355  TailStrategy tail = TailStrategy::Auto);
356  Stage &tile(const VarOrRVar &x, const VarOrRVar &y,
357  const VarOrRVar &xi, const VarOrRVar &yi,
358  const Expr &xfactor, const Expr &yfactor,
359  TailStrategy tail = TailStrategy::Auto);
360  Stage &tile(const std::vector<VarOrRVar> &previous,
361  const std::vector<VarOrRVar> &outers,
362  const std::vector<VarOrRVar> &inners,
363  const std::vector<Expr> &factors,
364  const std::vector<TailStrategy> &tails);
365  Stage &tile(const std::vector<VarOrRVar> &previous,
366  const std::vector<VarOrRVar> &outers,
367  const std::vector<VarOrRVar> &inners,
368  const std::vector<Expr> &factors,
369  TailStrategy tail = TailStrategy::Auto);
370  Stage &tile(const std::vector<VarOrRVar> &previous,
371  const std::vector<VarOrRVar> &inners,
372  const std::vector<Expr> &factors,
373  TailStrategy tail = TailStrategy::Auto);
374  Stage &reorder(const std::vector<VarOrRVar> &vars);
375 
376  template<typename... Args>
377  HALIDE_NO_USER_CODE_INLINE typename std::enable_if<Internal::all_are_convertible<VarOrRVar, Args...>::value, Stage &>::type
378  reorder(const VarOrRVar &x, const VarOrRVar &y, Args &&...args) {
379  std::vector<VarOrRVar> collected_args{x, y, std::forward<Args>(args)...};
380  return reorder(collected_args);
381  }
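 /* For illustration, a minimal sketch of applying these calls to an update
  stage obtained via Func::update(). The Func, Var, and RDom names here are
  assumptions for the example, not declarations from this header:
  \code
  Func f;
  Var x, y, xo, xi;
  RDom r(0, 100);
  f(x, y) = 0;
  f(x, y) += r.x;      // update definition over the reduction domain
  f.update(0)          // returns the Stage for the first update
   .split(x, xo, xi, 8)
   .vectorize(xi)
   .parallel(y);
  \endcode
  */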
382 
383  Stage &rename(const VarOrRVar &old_name, const VarOrRVar &new_name);
384  Stage specialize(const Expr &condition);
385  void specialize_fail(const std::string &message);
386 
387  Stage &gpu_threads(const VarOrRVar &thread_x, DeviceAPI device_api = DeviceAPI::Default_GPU);
388  Stage &gpu_threads(const VarOrRVar &thread_x, const VarOrRVar &thread_y, DeviceAPI device_api = DeviceAPI::Default_GPU);
389  Stage &gpu_threads(const VarOrRVar &thread_x, const VarOrRVar &thread_y, const VarOrRVar &thread_z, DeviceAPI device_api = DeviceAPI::Default_GPU);
390 
391  Stage &gpu_lanes(const VarOrRVar &thread_x, DeviceAPI device_api = DeviceAPI::Default_GPU);
392 
393  Stage &gpu_single_thread(DeviceAPI device_api = DeviceAPI::Default_GPU);
394 
395  Stage &gpu_blocks(const VarOrRVar &block_x, DeviceAPI device_api = DeviceAPI::Default_GPU);
396  Stage &gpu_blocks(const VarOrRVar &block_x, const VarOrRVar &block_y, DeviceAPI device_api = DeviceAPI::Default_GPU);
397  Stage &gpu_blocks(const VarOrRVar &block_x, const VarOrRVar &block_y, const VarOrRVar &block_z, DeviceAPI device_api = DeviceAPI::Default_GPU);
398 
399  Stage &gpu(const VarOrRVar &block_x, const VarOrRVar &thread_x, DeviceAPI device_api = DeviceAPI::Default_GPU);
400  Stage &gpu(const VarOrRVar &block_x, const VarOrRVar &block_y,
401  const VarOrRVar &thread_x, const VarOrRVar &thread_y,
402  DeviceAPI device_api = DeviceAPI::Default_GPU);
403  Stage &gpu(const VarOrRVar &block_x, const VarOrRVar &block_y, const VarOrRVar &block_z,
404  const VarOrRVar &thread_x, const VarOrRVar &thread_y, const VarOrRVar &thread_z,
405  DeviceAPI device_api = DeviceAPI::Default_GPU);
406 
407  Stage &gpu_tile(const VarOrRVar &x, const VarOrRVar &bx, const VarOrRVar &tx, const Expr &x_size,
408  TailStrategy tail = TailStrategy::Auto,
409  DeviceAPI device_api = DeviceAPI::Default_GPU);
410 
411  Stage &gpu_tile(const VarOrRVar &x, const VarOrRVar &tx, const Expr &x_size,
412  TailStrategy tail = TailStrategy::Auto,
413  DeviceAPI device_api = DeviceAPI::Default_GPU);
414  Stage &gpu_tile(const VarOrRVar &x, const VarOrRVar &y,
415  const VarOrRVar &bx, const VarOrRVar &by,
416  const VarOrRVar &tx, const VarOrRVar &ty,
417  const Expr &x_size, const Expr &y_size,
418  TailStrategy tail = TailStrategy::Auto,
419  DeviceAPI device_api = DeviceAPI::Default_GPU);
420 
421  Stage &gpu_tile(const VarOrRVar &x, const VarOrRVar &y,
422  const VarOrRVar &tx, const VarOrRVar &ty,
423  const Expr &x_size, const Expr &y_size,
424  TailStrategy tail = TailStrategy::Auto,
425  DeviceAPI device_api = DeviceAPI::Default_GPU);
426 
427  Stage &gpu_tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &z,
428  const VarOrRVar &bx, const VarOrRVar &by, const VarOrRVar &bz,
429  const VarOrRVar &tx, const VarOrRVar &ty, const VarOrRVar &tz,
430  const Expr &x_size, const Expr &y_size, const Expr &z_size,
431  TailStrategy tail = TailStrategy::Auto,
432  DeviceAPI device_api = DeviceAPI::Default_GPU);
433  Stage &gpu_tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &z,
434  const VarOrRVar &tx, const VarOrRVar &ty, const VarOrRVar &tz,
435  const Expr &x_size, const Expr &y_size, const Expr &z_size,
436  TailStrategy tail = TailStrategy::Auto,
437  DeviceAPI device_api = DeviceAPI::Default_GPU);
438 
439  Stage &allow_race_conditions();
440  Stage &atomic(bool override_associativity_test = false);
441 
442  Stage &hexagon(const VarOrRVar &x = Var::outermost());
443 
444  HALIDE_ATTRIBUTE_DEPRECATED("Call prefetch() with the two-var form instead.")
445  Stage &prefetch(const Func &f, const VarOrRVar &var, int offset = 1,
446  PrefetchBoundStrategy strategy = PrefetchBoundStrategy::GuardWithIf) {
447  return prefetch(f, var, var, offset, strategy);
448  }
449  HALIDE_ATTRIBUTE_DEPRECATED("Call prefetch() with the two-var form instead.")
450  Stage &prefetch(const Internal::Parameter &param, const VarOrRVar &var, int offset = 1,
451  PrefetchBoundStrategy strategy = PrefetchBoundStrategy::GuardWithIf) {
452  return prefetch(param, var, var, offset, strategy);
453  }
454  template<typename T>
455  HALIDE_ATTRIBUTE_DEPRECATED("Call prefetch() with the two-var form instead.")
456  Stage &prefetch(const T &image, VarOrRVar var, int offset = 1,
457  PrefetchBoundStrategy strategy = PrefetchBoundStrategy::GuardWithIf) {
458  return prefetch(image.parameter(), var, var, offset, strategy);
459  }
460  Stage &prefetch(const Func &f, const VarOrRVar &at, const VarOrRVar &from, Expr offset = 1,
461  PrefetchBoundStrategy strategy = PrefetchBoundStrategy::GuardWithIf);
462  Stage &prefetch(const Internal::Parameter &param, const VarOrRVar &at, const VarOrRVar &from, Expr offset = 1,
463  PrefetchBoundStrategy strategy = PrefetchBoundStrategy::GuardWithIf);
464  template<typename T>
465  Stage &prefetch(const T &image, const VarOrRVar &at, const VarOrRVar &from, Expr offset = 1,
466  PrefetchBoundStrategy strategy = PrefetchBoundStrategy::GuardWithIf) {
467  return prefetch(image.parameter(), at, from, std::move(offset), strategy);
468  }
469  // @}
470 
471  /** Attempt to get the source file and line where this stage was
472  * defined by parsing the process's own debug symbols. Returns an
473  * empty string if no debug symbols were found or the debug
474  * symbols were not understood. Works on OS X and Linux only. */
475  std::string source_location() const;
476 
477  /** Assert that this stage has intentionally been given no schedule, and
478  * suppress the warning about unscheduled update definitions that would
479  * otherwise fire. This counts as a schedule, so calling this twice on the
480  * same Stage will fail the assertion. */
481  void unscheduled();
482 };
483 
484 // For backwards compatibility, keep the ScheduleHandle name.
485 typedef Stage ScheduleHandle;
486 
487 class FuncTupleElementRef;
488 
489 /** A fragment of front-end syntax of the form f(x, y, z), where x, y,
490  * z are Vars or Exprs. It could be the left-hand-side of a definition or
491  * an update definition, or it could be a call to a function. We don't know
492  * until we see how this object gets used.
493  */
494 class FuncRef {
495  Internal::Function func;
496  int implicit_placeholder_pos;
497  int implicit_count;
498  std::vector<Expr> args;
499  std::vector<Expr> args_with_implicit_vars(const std::vector<Expr> &e) const;
500 
501  /** Helper for function update by Tuple. If the function does not
502  * already have a pure definition, init_val will be used as RHS of
503  * each tuple element in the initial function definition. */
504  template<typename BinaryOp>
505  Stage func_ref_update(const Tuple &e, int init_val);
506 
507  /** Helper for function update by Expr. If the function does not
508  * already have a pure definition, init_val will be used as RHS in
509  * the initial function definition. */
510  template<typename BinaryOp>
511  Stage func_ref_update(Expr e, int init_val);
512 
513 public:
514  FuncRef(const Internal::Function &, const std::vector<Expr> &,
515  int placeholder_pos = -1, int count = 0);
516  FuncRef(Internal::Function, const std::vector<Var> &,
517  int placeholder_pos = -1, int count = 0);
518 
519  /** Use this as the left-hand-side of a definition or an update definition
520  * (see \ref RDom).
521  */
522  Stage operator=(const Expr &);
523 
524  /** Use this as the left-hand-side of a definition or an update definition
525  * for a Func with multiple outputs. */
526  Stage operator=(const Tuple &);
527 
528  /** Define a stage that adds the given expression to this Func. If the
529  * expression refers to some RDom, this performs a sum reduction of the
530  * expression over the domain. If the function does not already have a
531  * pure definition, this sets it to zero.
532  */
533  // @{
534  Stage operator+=(Expr);
535  Stage operator+=(const Tuple &);
536  Stage operator+=(const FuncRef &);
537  // @}
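 /* For illustration, a minimal sketch of a sum reduction written with
  operator+=. The Funcs, Var, and RDom bounds are assumptions for the example:
  \code
  Func f, g;
  Var x;
  RDom r(0, 100);
  g(x) = x * x;
  f(x) += g(x + r.x);  // f has no pure definition, so it is initialized to 0,
                       // then accumulates g(x + r.x) over r in [0, 100)
  \endcode
  */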
538 
539  /** Define a stage that adds the negative of the given expression to this
540  * Func. If the expression refers to some RDom, this performs a sum reduction
541  * of the negative of the expression over the domain. If the function does
542  * not already have a pure definition, this sets it to zero.
543  */
544  // @{
545  Stage operator-=(Expr);
546  Stage operator-=(const Tuple &);
547  Stage operator-=(const FuncRef &);
548  // @}
549 
550  /** Define a stage that multiplies this Func by the given expression. If the
551  * expression refers to some RDom, this performs a product reduction of the
552  * expression over the domain. If the function does not already have a pure
553  * definition, this sets it to 1.
554  */
555  // @{
556  Stage operator*=(Expr);
557  Stage operator*=(const Tuple &);
558  Stage operator*=(const FuncRef &);
559  // @}
560 
561  /** Define a stage that divides this Func by the given expression.
562  * If the expression refers to some RDom, this performs a product
563  * reduction of the inverse of the expression over the domain. If the
564  * function does not already have a pure definition, this sets it to 1.
565  */
566  // @{
567  Stage operator/=(Expr);
568  Stage operator/=(const Tuple &);
569  Stage operator/=(const FuncRef &);
570  // @}
571 
572  /* Override the usual assignment operator, so that
573  * f(x, y) = g(x, y) defines f.
574  */
575  Stage operator=(const FuncRef &);
576 
577  /** Use this as a call to the function, and not the left-hand-side
578  * of a definition. Only works for single-output Funcs. */
579  operator Expr() const;
580 
581  /** When a FuncRef refers to a function that provides multiple
582  * outputs, you can access each output as an Expr using
583  * operator[].
584  */
585  FuncTupleElementRef operator[](int) const;
586 
587  /** How many outputs does the function this refers to produce. */
588  size_t size() const;
589 
590  /** What function is this calling? */
591  Internal::Function function() const {
592  return func;
593  }
594 };
595 
596 /** Explicit overloads of min and max for FuncRef. These exist to
597  * disambiguate calls to min on FuncRefs when a user has pulled both
598  * Halide::min and std::min into their namespace. */
599 // @{
600 inline Expr min(const FuncRef &a, const FuncRef &b) {
601  return min(Expr(a), Expr(b));
602 }
603 inline Expr max(const FuncRef &a, const FuncRef &b) {
604  return max(Expr(a), Expr(b));
605 }
606 // @}
607 
608 /** A fragment of front-end syntax of the form f(x, y, z)[index], where x, y,
609  * z are Vars or Exprs. It could be the left-hand-side of an update
610  * definition, or it could be a call to a function. We don't know
611  * until we see how this object gets used.
612  */
613 class FuncTupleElementRef {
614  FuncRef func_ref;
615  std::vector<Expr> args; // args to the function
616  int idx; // Index to function outputs
617 
618  /** Helper function that generates a Tuple where element at 'idx' is set
619  * to 'e' and the rest are undef. */
620  Tuple values_with_undefs(const Expr &e) const;
621 
622 public:
623  FuncTupleElementRef(const FuncRef &ref, const std::vector<Expr> &args, int idx);
624 
625  /** Use this as the left-hand-side of an update definition of Tuple
626  * component 'idx' of a Func (see \ref RDom). The function must
627  * already have an initial definition.
628  */
629  Stage operator=(const Expr &e);
630 
631  /** Define a stage that adds the given expression to Tuple component 'idx'
632  * of this Func. The other Tuple components are unchanged. If the expression
633  * refers to some RDom, this performs a sum reduction of the expression over
634  * the domain. The function must already have an initial definition.
635  */
636  Stage operator+=(const Expr &e);
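 /* For illustration, a minimal sketch of updating a single Tuple component.
  The names and bounds are assumptions for the example:
  \code
  Func f;
  Var x;
  RDom r(0, 10);
  f(x) = Tuple(0, 1);   // initial definition of both components
  f(x)[0] += r.x;       // sum-reduce into component 0; component 1 is unchanged
  f(x)[1] *= 2;         // scale component 1; component 0 is unchanged
  \endcode
  */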
637 
638  /** Define a stage that adds the negative of the given expression to Tuple
639  * component 'idx' of this Func. The other Tuple components are unchanged.
640  * If the expression refers to some RDom, this performs a sum reduction of
641  * the negative of the expression over the domain. The function must already
642  * have an initial definition.
643  */
644  Stage operator-=(const Expr &e);
645 
646  /** Define a stage that multiplies Tuple component 'idx' of this Func by
647  * the given expression. The other Tuple components are unchanged. If the
648  * expression refers to some RDom, this performs a product reduction of
649  * the expression over the domain. The function must already have an
650  * initial definition.
651  */
652  Stage operator*=(const Expr &e);
653 
654  /** Define a stage that divides Tuple component 'idx' of this Func by
655  * the given expression. The other Tuple components are unchanged.
656  * If the expression refers to some RDom, this performs a product
657  * reduction of the inverse of the expression over the domain. The function
658  * must already have an initial definition.
659  */
660  Stage operator/=(const Expr &e);
661 
662  /* Override the usual assignment operator, so that
663  * f(x, y)[index] = g(x, y) defines f.
664  */
665  Stage operator=(const FuncTupleElementRef &);
666 
667  /** Use this as a call to Tuple component 'idx' of a Func, and not the
668  * left-hand-side of a definition. */
669  operator Expr() const;
670 
671  /** What function is this calling? */
672  Internal::Function function() const {
673  return func_ref.function();
674  }
675 
676  /** Return index to the function outputs. */
677  int index() const {
678  return idx;
679  }
680 };
681 
682 namespace Internal {
683 class IRMutator;
684 } // namespace Internal
685 
686 /** Helper class for identifying the purpose of an Expr passed to memoize.
687  */
688 class EvictionKey {
689 protected:
690  Expr key;
691  friend class Func;
692 
693 public:
694  explicit EvictionKey(const Expr &expr = Expr())
695  : key(expr) {
696  }
697 };
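 /* For illustration, a minimal sketch of passing an EvictionKey to
  Func::memoize, which is the use referred to above. The Param and the
  surrounding pipeline are assumptions for the example:
  \code
  Param<int> p;
  Func f;
  Var x;
  f(x) = x * p;
  f.compute_root().memoize(EvictionKey(p));  // memoized entries can later be
                                             // evicted using this key
  \endcode
  */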
698 
699 /** A halide function. This class represents one stage in a Halide
700  * pipeline, and is the unit by which we schedule things. By default
701  * they are aggressively inlined, so you are encouraged to make lots
702  * of little functions, rather than storing things in Exprs. */
703 class Func {
704 
705  /** A handle on the internal halide function that this
706  * represents */
707  Internal::Function func;
708 
709  /** When you make a reference to this function with fewer
710  * arguments than it has dimensions, the argument list is bulked
711  * up with 'implicit' vars with canonical names. This lets you
712  * pass around partially applied Halide functions. */
713  // @{
714  std::pair<int, int> add_implicit_vars(std::vector<Var> &) const;
715  std::pair<int, int> add_implicit_vars(std::vector<Expr> &) const;
716  // @}
717 
718  /** The imaging pipeline that outputs this Func alone. */
719  Pipeline pipeline_;
720 
721  /** Get the imaging pipeline that outputs this Func alone,
722  * creating it (and freezing the Func) if necessary. */
723  Pipeline pipeline();
724 
725  // Helper function for recursive reordering support
726  Func &reorder_storage(const std::vector<Var> &dims, size_t start);
727 
728  void invalidate_cache();
729 
730 public:
731  /** Declare a new undefined function with the given name */
732  explicit Func(const std::string &name);
733 
734  /** Declare a new undefined function with an
735  * automatically-generated unique name */
736  Func();
737 
738  /** Declare a new function with an automatically-generated unique
739  * name, and define it to return the given expression (which may
740  * not contain free variables). */
741  explicit Func(const Expr &e);
742 
743  /** Construct a new Func to wrap an existing, already-defined
744  * Function object. */
745  explicit Func(Internal::Function f);
746 
747  /** Construct a new Func to wrap a Buffer. */
748  template<typename T, int Dims>
749  HALIDE_NO_USER_CODE_INLINE explicit Func(Buffer<T, Dims> &im)
750  : Func() {
751  (*this)(_) = im(_);
752  }
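 /* For illustration, a minimal sketch of wrapping an existing Buffer so it can
  be consumed (and scheduled) like any other Func. The buffer size and contents
  are assumptions for the example:
  \code
  Buffer<float> im(128, 128);
  im.fill(0.0f);
  Func f(im);              // f(x, y) == im(x, y) over im's extent
  Func g;
  Var x, y;
  g(x, y) = f(x, y) * 2.0f;
  \endcode
  */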
753 
754  /** Evaluate this function over some rectangular domain and return
755  * the resulting buffer or buffers. Performs compilation if the
756  * Func has not previously been realized and compile_jit has not
757  * been called. If the final stage of the pipeline is on the GPU,
758  * data is copied back to the host before being returned. The
759  * returned Realization should probably be instantly converted to
760  * a Buffer class of the appropriate type. That is, do this:
761  *
762  \code
763  f(x) = sin(x);
764  Buffer<float> im = f.realize(...);
765  \endcode
766  *
767  * If your Func has multiple values, because you defined it using
768  * a Tuple, then casting the result of a realize call to a buffer
769  * or image will produce a run-time error. Instead you should do the
770  * following:
771  *
772  \code
773  f(x) = Tuple(x, sin(x));
774  Realization r = f.realize(...);
775  Buffer<int> im0 = r[0];
776  Buffer<float> im1 = r[1];
777  \endcode
778  *
779  * In Halide formal arguments of a computation are specified using
780  * Param<T> and ImageParam objects in the expressions defining the
781  * computation. The param_map argument to realize allows
782  * specifying a set of per-call parameters to be used for a
783  * specific computation. This method is thread-safe where the
784  * globals used by Param<T> and ImageParam are not. Any parameters
785  * that are not in the param_map are taken from the global values,
786  * so those can continue to be used if they are not changing
787  * per-thread.
788  *
789  * One can explicitly construct a ParamMap and
790  * use its set method to insert Parameter to scalar or Buffer
791  * value mappings:
792  *
793  \code
794  Param<int32_t> p(42);
795  ImageParam img(Int(32), 1);
796  f(x) = img(x) + p;
797 
798  Buffer<int32_t> arg_img(10, 10);
799  <fill in arg_img...>
800  ParamMap params;
801  params.set(p, 17);
802  params.set(img, arg_img);
803 
804  Target t = get_jit_target_from_environment();
805  Buffer<int32_t> result = f.realize({10, 10}, t, params);
806  \endcode
807  *
808  * Alternatively, an initializer list can be used
809  * directly in the realize call to pass this information:
810  *
811  \code
812  Param<int32_t> p(42);
813  ImageParam img(Int(32), 1);
814  f(x) = img(x) + p;
815 
816  Buffer<int32_t> arg_img(10, 10);
817  <fill in arg_img...>
818 
819  Target t = get_jit_target_from_environment();
820  Buffer<int32_t> result = f.realize({10, 10}, t, { { p, 17 }, { img, arg_img } });
821  \endcode
822  *
823  * If the Func cannot be realized into a buffer of the given size
824  * due to scheduling constraints on scattering update definitions,
825  * it will be realized into a larger buffer of the minimum size
826  * possible, and a cropped view at the requested size will be
827  * returned. It is thus not safe to assume the returned buffers
828  * are contiguous in memory. This behavior can be disabled with
829  * the NoBoundsQuery target flag, in which case an error about
830  * writing out of bounds on the output buffer will trigger
831  * instead.
832  *
833  */
834  Realization realize(std::vector<int32_t> sizes = {}, const Target &target = Target(),
835  const ParamMap &param_map = ParamMap::empty_map());
836 
837  /** Same as above, but takes a custom user-provided context to be
838  * passed to runtime functions. This can be used to pass state to
839  * runtime overrides in a thread-safe manner. A nullptr context is
840  * legal, and is equivalent to calling the variant of realize
841  * that does not take a context. */
842  Realization realize(JITUserContext *context,
843  std::vector<int32_t> sizes = {},
844  const Target &target = Target(),
845  const ParamMap &param_map = ParamMap::empty_map());
846 
847  /** Evaluate this function into an existing allocated buffer or
848  * buffers. If the buffer is also one of the arguments to the
849  * function, strange things may happen, as the pipeline isn't
850  * necessarily safe to run in-place. If you pass multiple buffers,
851  * they must have matching sizes. This form of realize does *not*
852  * automatically copy data back from the GPU. */
853  void realize(Pipeline::RealizationArg outputs, const Target &target = Target(),
854  const ParamMap &param_map = ParamMap::empty_map());
855 
856  /** Same as above, but takes a custom user-provided context to be
857  * passed to runtime functions. This can be used to pass state to
858  * runtime overrides in a thread-safe manner. A nullptr context is
859  * legal, and is equivalent to calling the variant of realize
860  * that does not take a context. */
861  void realize(JITUserContext *context,
862  Pipeline::RealizationArg outputs,
863  const Target &target = Target(),
864  const ParamMap &param_map = ParamMap::empty_map());
865 
866  /** For a given size of output, or a given output buffer,
867  * determine the bounds required of all unbound ImageParams
868  * referenced. Communicates the result by allocating new buffers
869  * of the appropriate size and binding them to the unbound
870  * ImageParams.
871  *
872  * See the documentation for Func::realize regarding the
873  * ParamMap. There is one difference in that input Buffer<>
874  * arguments that are being inferred are specified as a pointer to
875  * the Buffer<> in the ParamMap. E.g.
876  *
877  \code
878  Param<int32_t> p(42);
879  ImageParam img(Int(32), 1);
880  f(x) = img(x) + p;
881 
882  Target t = get_jit_target_from_environment();
883  Buffer<> in;
884  f.infer_input_bounds({10, 10}, t, { { img, &in } });
885  \endcode
886  * On return, in will be an allocated buffer of the correct size
887  * to evaluate f over a 10x10 region.
888  */
889  // @{
890  void infer_input_bounds(const std::vector<int32_t> &sizes,
891  const Target &target = get_jit_target_from_environment(),
892  const ParamMap &param_map = ParamMap::empty_map());
893  void infer_input_bounds(Pipeline::RealizationArg outputs,
894  const Target &target = get_jit_target_from_environment(),
895  const ParamMap &param_map = ParamMap::empty_map());
896  // @}
897 
898  /** Versions of infer_input_bounds that take a custom user context
899  * to pass to runtime functions. */
900  // @{
901  void infer_input_bounds(JITUserContext *context,
902  const std::vector<int32_t> &sizes,
903  const Target &target = get_jit_target_from_environment(),
904  const ParamMap &param_map = ParamMap::empty_map());
905  void infer_input_bounds(JITUserContext *context,
906  Pipeline::RealizationArg outputs,
907  const Target &target = get_jit_target_from_environment(),
908  const ParamMap &param_map = ParamMap::empty_map());
909  // @}
910  /** Statically compile this function to llvm bitcode, with the
911  * given filename (which should probably end in .bc), type
912  * signature, and C function name (which defaults to the same name
913  * as this halide function). */
914  //@{
915  void compile_to_bitcode(const std::string &filename, const std::vector<Argument> &, const std::string &fn_name,
916  const Target &target = get_target_from_environment());
917  void compile_to_bitcode(const std::string &filename, const std::vector<Argument> &,
918  const Target &target = get_target_from_environment());
919  // @}
920 
921  /** Statically compile this function to llvm assembly, with the
922  * given filename (which should probably end in .ll), type
923  * signature, and C function name (which defaults to the same name
924  * as this halide function). */
925  //@{
926  void compile_to_llvm_assembly(const std::string &filename, const std::vector<Argument> &, const std::string &fn_name,
927  const Target &target = get_target_from_environment());
928  void compile_to_llvm_assembly(const std::string &filename, const std::vector<Argument> &,
929  const Target &target = get_target_from_environment());
930  // @}
931 
932  /** Statically compile this function to an object file, with the
933  * given filename (which should probably end in .o or .obj), type
934  * signature, and C function name (which defaults to the same name
935  * as this halide function). You probably don't want to use this
936  * directly; call compile_to_static_library or compile_to_file instead. */
937  //@{
938  void compile_to_object(const std::string &filename, const std::vector<Argument> &, const std::string &fn_name,
939  const Target &target = get_target_from_environment());
940  void compile_to_object(const std::string &filename, const std::vector<Argument> &,
941  const Target &target = get_target_from_environment());
942  // @}
943 
944  /** Emit a header file with the given filename for this
945  * function. The header will define a function with the type
946  * signature given by the second argument, and a name given by the
947  * third. The name defaults to the same name as this halide
948  * function. You don't actually have to have defined this function
949  * yet to call this. You probably don't want to use this directly;
950  * call compile_to_static_library or compile_to_file instead. */
951  void compile_to_header(const std::string &filename, const std::vector<Argument> &, const std::string &fn_name = "",
952  const Target &target = get_target_from_environment());
953 
954  /** Statically compile this function to text assembly equivalent
955  * to the object file generated by compile_to_object. This is
956  * useful for checking what Halide is producing without having to
957  * disassemble anything, or if you need to feed the assembly into
958  * some custom toolchain to produce an object file (e.g. iOS) */
959  //@{
960  void compile_to_assembly(const std::string &filename, const std::vector<Argument> &, const std::string &fn_name,
961  const Target &target = get_target_from_environment());
962  void compile_to_assembly(const std::string &filename, const std::vector<Argument> &,
963  const Target &target = get_target_from_environment());
964  // @}
965 
966  /** Statically compile this function to C source code. This is
967  * useful for providing fallback code paths that will compile on
968  * many platforms. Vectorization will fail, and parallelization
969  * will produce serial code. */
970  void compile_to_c(const std::string &filename,
971  const std::vector<Argument> &,
972  const std::string &fn_name = "",
973  const Target &target = get_target_from_environment());
974 
975  /** Write out an internal representation of lowered code. Useful
976  * for analyzing and debugging scheduling. Can emit html or plain
977  * text. */
978  void compile_to_lowered_stmt(const std::string &filename,
979  const std::vector<Argument> &args,
980  StmtOutputFormat fmt = Text,
981  const Target &target = get_target_from_environment());
982 
983  /** Write out the loop nests specified by the schedule for this
984  * Function. Helpful for understanding what a schedule is
985  * doing. */
986  void print_loop_nest();
987 
988  /** Compile to object file and header pair, with the given
989  * arguments. The name defaults to the same name as this halide
990  * function.
991  */
992  void compile_to_file(const std::string &filename_prefix, const std::vector<Argument> &args,
993  const std::string &fn_name = "",
994  const Target &target = get_target_from_environment());
995 
996  /** Compile to static-library file and header pair, with the given
997  * arguments. The name defaults to the same name as this halide
998  * function.
999  */
1000  void compile_to_static_library(const std::string &filename_prefix, const std::vector<Argument> &args,
1001  const std::string &fn_name = "",
1002  const Target &target = get_target_from_environment());
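 /* For illustration, a minimal ahead-of-time sketch using
  compile_to_static_library. The output name "brighten" and the parameters are
  assumptions for the example:
  \code
  ImageParam input(UInt(8), 2);
  Param<uint8_t> offset;
  Func brighter;
  Var x, y;
  brighter(x, y) = input(x, y) + offset;
  brighter.compile_to_static_library("brighten", {input, offset}, "brighten");
  // produces a static library plus a matching "brighten.h" header
  \endcode
  */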
1003 
1004  /** Compile to static-library file and header pair once for each target;
1005  * each resulting function will be considered (in order) via halide_can_use_target_features()
1006  * at runtime, with the first appropriate match being selected for subsequent use.
1007  * This is typically useful for specializations that may vary unpredictably by machine
1008  * (e.g., SSE4.1/AVX/AVX2 on x86 desktop machines).
1009  * All targets must have identical arch-os-bits.
1010  */
1011  void compile_to_multitarget_static_library(const std::string &filename_prefix,
1012  const std::vector<Argument> &args,
1013  const std::vector<Target> &targets);
1014 
1015  /** Like compile_to_multitarget_static_library(), except that the object files
1016  * are all output as object files (rather than bundled into a static library).
1017  *
1018  * `suffixes` is an optional list of strings to use as the suffix for each object
1019  * file. If nonempty, it must be the same length as `targets`. (If empty, Target::to_string()
1020  * will be used for each suffix.)
1021  *
1022  * Note that if `targets.size()` > 1, the wrapper code (to select the subtarget)
1023  * will be generated with the filename `${filename_prefix}_wrapper.o`
1024  *
1025  * Note that if `targets.size()` > 1 and `no_runtime` is not specified, the runtime
1026  * will be generated with the filename `${filename_prefix}_runtime.o`
1027  */
1028  void compile_to_multitarget_object_files(const std::string &filename_prefix,
1029  const std::vector<Argument> &args,
1030  const std::vector<Target> &targets,
1031  const std::vector<std::string> &suffixes);
1032 
1033  /** Store an internal representation of lowered code as a
1034  * self-contained Module suitable for further compilation. */
1035  Module compile_to_module(const std::vector<Argument> &args, const std::string &fn_name = "",
1036  const Target &target = get_target_from_environment());
1037 
1038  /** Compile and generate multiple target files with a single call.
1039  * Deduces target files based on filenames specified in
1040  * the output_files map.
1041  */
1042  void compile_to(const std::map<OutputFileType, std::string> &output_files,
1043  const std::vector<Argument> &args,
1044  const std::string &fn_name,
1045  const Target &target = get_target_from_environment());
1046 
1047  /** Eagerly jit compile the function to machine code. This
1048  * normally happens on the first call to realize. If you're
1049  * running your halide pipeline inside time-sensitive code and
1050  * wish to avoid including the time taken to compile a pipeline,
1051  * then you can call this ahead of time. Default is to use the Target
1052  * returned from Halide::get_jit_target_from_environment()
1053  */
1054  void compile_jit(const Target &target = get_jit_target_from_environment());
1055 
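 /* For illustration, a minimal sketch of paying the JIT-compilation cost ahead
  of a timed section. The loop and sizes are assumptions for the example:
  \code
  Func f;
  Var x, y;
  f(x, y) = x + y;
  f.compile_jit(get_jit_target_from_environment());  // compile now
  for (int i = 0; i < 100; i++) {
      Buffer<int> out = f.realize({1024, 1024});     // no recompilation here
  }
  \endcode
  */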
1056  /** Deprecated variants of the above that use a void pointer
1057  * instead of a JITUserContext pointer. */
1058  // @{
1059  HALIDE_ATTRIBUTE_DEPRECATED("Custom handlers should be set by modifying the struct returned by jit_handlers()")
1060  void set_error_handler(void (*handler)(void *, const char *));
1061  HALIDE_ATTRIBUTE_DEPRECATED("Custom handlers should be set by modifying the struct returned by jit_handlers()")
1062  void set_custom_allocator(void *(*malloc)(void *, size_t),
1063  void (*free)(void *, void *));
1064  HALIDE_ATTRIBUTE_DEPRECATED("Custom handlers should be set by modifying the struct returned by jit_handlers()")
1065  void set_custom_do_task(
1066  int (*custom_do_task)(void *, int (*)(void *, int, uint8_t *),
1067  int, uint8_t *));
1068  HALIDE_ATTRIBUTE_DEPRECATED("Custom handlers should be set by modifying the struct returned by jit_handlers()")
1069  void set_custom_do_par_for(
1070  int (*custom_do_par_for)(void *, int (*)(void *, int, uint8_t *), int,
1071  int, uint8_t *));
1072  HALIDE_ATTRIBUTE_DEPRECATED("Custom handlers should be set by modifying the struct returned by jit_handlers()")
1073  void set_custom_trace(int (*trace_fn)(void *, const halide_trace_event_t *));
1074 
1075  HALIDE_ATTRIBUTE_DEPRECATED("Custom handlers should be set by modifying the struct returned by jit_handlers()")
1076  void set_custom_print(void (*handler)(void *, const char *));
1077  // @}
1078 
1079  /** Get a struct containing the currently set custom functions
1080  * used by JIT. This can be mutated. Changes will take effect the
1081  * next time this Func is realized. */
1082  JITHandlers &jit_handlers();
1083 
1084  /** Add a custom pass to be used during lowering. It is run after
1085  * all other lowering passes. Can be used to verify properties of
1086  * the lowered Stmt, instrument it with extra code, or otherwise
1087  * modify it. The Func takes ownership of the pass, and will call
1088  * delete on it when the Func goes out of scope. So don't pass a
1089  * stack object, or share pass instances between multiple
1090  * Funcs. */
1091  template<typename T>
1092  void add_custom_lowering_pass(T *pass) {
1093  // Template instantiate a custom deleter for this type, then
1094  // wrap in a lambda. The custom deleter lives in user code, so
1095  // that deletion is on the same heap as construction (I hate Windows).
1096  add_custom_lowering_pass(pass, [pass]() { delete_lowering_pass<T>(pass); });
1097  }
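 /* For illustration, a minimal sketch of registering a custom lowering pass.
  The pass below is a do-nothing placeholder; a real pass would override
  IRMutator's visit/mutate methods:
  \code
  class MyPass : public Internal::IRMutator {
      // override visit() overloads here to inspect or rewrite the lowered Stmt
  };
  Func f;
  Var x;
  f(x) = x;
  f.add_custom_lowering_pass(new MyPass);  // the Func takes ownership
  \endcode
  */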
1098 
1099  /** Add a custom pass to be used during lowering, with the
1100  * function that will be called to delete it also passed in. Set
1101  * it to nullptr if you wish to retain ownership of the object. */
1102  void add_custom_lowering_pass(Internal::IRMutator *pass, std::function<void()> deleter);
1103 
1104  /** Remove all previously-set custom lowering passes */
1105  void clear_custom_lowering_passes();
1106 
1107  /** Get the custom lowering passes. */
1108  const std::vector<CustomLoweringPass> &custom_lowering_passes();
1109 
1110  /** When this function is compiled, include code that dumps its
1111  * values to a file after it is realized, for the purpose of
1112  * debugging.
1113  *
1114  * If filename ends in ".tif" or ".tiff" (case insensitive) the file
1115  * is in TIFF format and can be read by standard tools. Otherwise, the
1116  * file format is as follows:
1117  *
1118  * All data is in the byte-order of the target platform. First, a
1119  * 20-byte header containing four 32-bit ints, giving the extents
1120  * of the first four dimensions. Dimensions beyond four are
1121  * folded into the fourth. Then, a fifth 32-bit int giving the
1122  * data type of the function. The typecodes are given by: float =
1123  * 0, double = 1, uint8_t = 2, int8_t = 3, uint16_t = 4, int16_t =
1124  * 5, uint32_t = 6, int32_t = 7, uint64_t = 8, int64_t = 9. The
1125  * data follows the header, as a densely packed array of the given
1126  * size and the given type. If given the extension .tmp, this file
1127  * format can be natively read by the program ImageStack. */
1128  void debug_to_file(const std::string &filename);
1129 
1130  /** The name of this function, either given during construction,
1131  * or automatically generated. */
1132  const std::string &name() const;
1133 
1134  /** Get the pure arguments. */
1135  std::vector<Var> args() const;
1136 
1137  /** The right-hand-side value of the pure definition of this
1138  * function. Causes an error if there's no pure definition, or if
1139  * the function is defined to return multiple values. */
1140  Expr value() const;
1141 
1142  /** The values returned by this function. An error if the function
1143  * has not been defined. Returns a Tuple with one element for
1144  * functions defined to return a single value. */
1145  Tuple values() const;
1146 
1147  /** Does this function have at least a pure definition? */
1148  bool defined() const;
1149 
1150  /** Get the left-hand-side of the update definition. An empty
1151  * vector if there's no update definition. If there are
1152  * multiple update definitions for this function, use the
1153  * argument to select which one you want. */
1154  const std::vector<Expr> &update_args(int idx = 0) const;
1155 
1156  /** Get the right-hand-side of an update definition. An error if
1157  * there's no update definition. If there are multiple
1158  * update definitions for this function, use the argument to
1159  * select which one you want. */
1160  Expr update_value(int idx = 0) const;
1161 
1162  /** Get the right-hand-side of an update definition for
1163  * functions that return multiple values. An error if there's no
1164  * update definition. Returns a Tuple with one element for
1165  * functions that return a single value. */
1166  Tuple update_values(int idx = 0) const;
1167 
1168  /** Get the RVars of the reduction domain for an update definition, if there is
1169  * one. */
1170  std::vector<RVar> rvars(int idx = 0) const;
1171 
1172  /** Does this function have at least one update definition? */
1173  bool has_update_definition() const;
1174 
1175  /** How many update definitions does this function have? */
1176  int num_update_definitions() const;
1177 
1178  /** Is this function an external stage? That is, was it defined
1179  * using define_extern? */
1180  bool is_extern() const;
1181 
1182  /** Add an extern definition for this Func. This lets you define a
1183  * Func that represents an external pipeline stage. You can, for
1184  * example, use it to wrap a call to an extern library such as
1185  * fftw. */
1186  // @{
1187  void define_extern(const std::string &function_name,
1188  const std::vector<ExternFuncArgument> &params, Type t,
1189  int dimensionality,
1190  NameMangling mangling = NameMangling::Default,
1191  DeviceAPI device_api = DeviceAPI::Host) {
1192  define_extern(function_name, params, t,
1193  Internal::make_argument_list(dimensionality), mangling,
1194  device_api);
1195  }
1196 
1197  void define_extern(const std::string &function_name,
1198  const std::vector<ExternFuncArgument> &params,
1199  const std::vector<Type> &types, int dimensionality,
1200  NameMangling mangling) {
1201  define_extern(function_name, params, types,
1202  Internal::make_argument_list(dimensionality), mangling);
1203  }
1204 
1205  void define_extern(const std::string &function_name,
1206  const std::vector<ExternFuncArgument> &params,
1207  const std::vector<Type> &types, int dimensionality,
1208  NameMangling mangling = NameMangling::Default,
1209  DeviceAPI device_api = DeviceAPI::Host) {
1210  define_extern(function_name, params, types,
1211  Internal::make_argument_list(dimensionality), mangling,
1212  device_api);
1213  }
1214 
1215  void define_extern(const std::string &function_name,
1216  const std::vector<ExternFuncArgument> &params, Type t,
1217  const std::vector<Var> &arguments,
1218  NameMangling mangling = NameMangling::Default,
1219  DeviceAPI device_api = DeviceAPI::Host) {
1220  define_extern(function_name, params, std::vector<Type>{t}, arguments,
1221  mangling, device_api);
1222  }
1223 
1224  void define_extern(const std::string &function_name,
1225  const std::vector<ExternFuncArgument> &params,
1226  const std::vector<Type> &types,
1227  const std::vector<Var> &arguments,
1228  NameMangling mangling = NameMangling::Default,
1229  DeviceAPI device_api = DeviceAPI::Host);
1230  // @}
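 /* For illustration, a minimal sketch of declaring an extern stage. The C
  function name "halide_example_blur" is an assumption: it must be implemented
  separately, follow the extern-stage calling convention, and be linked in:
  \code
  Buffer<float> coeffs(3);
  coeffs.fill(1.0f / 3.0f);
  Func blurred;
  std::vector<ExternFuncArgument> ext_args = {coeffs};
  blurred.define_extern("halide_example_blur", ext_args, Float(32), 2);
  // blurred can now be consumed or realized like any other Func
  \endcode
  */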
1231 
1232  /** Get the types of the outputs of this Func. */
1233  const std::vector<Type> &output_types() const;
1234 
1235  /** Get the number of outputs of this Func. Corresponds to the
1236  * size of the Tuple this Func was defined to return. */
1237  int outputs() const;
1238 
1239  /** Get the name of the extern function called for an extern
1240  * definition. */
1241  const std::string &extern_function_name() const;
1242 
1243  /** The dimensionality (number of arguments) of this
1244  * function. Zero if the function is not yet defined. */
1245  int dimensions() const;
1246 
1247  /** Construct either the left-hand-side of a definition, or a call
1248  * to a function that happens to only contain vars as
1249  * arguments. If the function has already been defined, and fewer
1250  * arguments are given than the function has dimensions, then
1251  * enough implicit vars are added to the end of the argument list
1252  * to make up the difference (see \ref Var::implicit) */
1253  // @{
1254  FuncRef operator()(std::vector<Var>) const;
1255 
1256  template<typename... Args>
1257  HALIDE_NO_USER_CODE_INLINE typename std::enable_if<Internal::all_are_convertible<Var, Args...>::value, FuncRef>::type
1258  operator()(Args &&...args) const {
1259  std::vector<Var> collected_args{std::forward<Args>(args)...};
1260  return this->operator()(collected_args);
1261  }
1262  // @}
1263 
1264  /** Either calls to the function, or the left-hand-side of
1265  * an update definition (see \ref RDom). If the function has
1266  * already been defined, and fewer arguments are given than the
1267  * function has dimensions, then enough implicit vars are added to
1268  * the end of the argument list to make up the difference. (see
1269  * \ref Var::implicit)*/
1270  // @{
1271  FuncRef operator()(std::vector<Expr>) const;
1272 
1273  template<typename... Args>
1274  HALIDE_NO_USER_CODE_INLINE typename std::enable_if<Internal::all_are_convertible<Expr, Args...>::value, FuncRef>::type
1275  operator()(const Expr &x, Args &&...args) const {
1276  std::vector<Expr> collected_args{x, std::forward<Args>(args)...};
1277  return (*this)(collected_args);
1278  }
1279  // @}
1280 
1281  /** Creates and returns a new identity Func that wraps this Func. During
1282  * compilation, Halide replaces all calls to this Func done by 'f'
1283  * with calls to the wrapper. If this Func is already wrapped for
1284  * use in 'f', this will return the existing wrapper.
1285  *
1286  * For example, g.in(f) would rewrite a pipeline like this:
1287  \code
1288  g(x, y) = ...
1289  f(x, y) = ... g(x, y) ...
1290  \endcode
1291  * into a pipeline like this:
1292  \code
1293  g(x, y) = ...
1294  g_wrap(x, y) = g(x, y)
1295  f(x, y) = ... g_wrap(x, y)
1296  \endcode
1297  *
1298  * This has a variety of uses. You can use it to schedule this
1299  * Func differently in the different places it is used:
1300  \code
1301  g(x, y) = ...
1302  f1(x, y) = ... g(x, y) ...
1303  f2(x, y) = ... g(x, y) ...
1304  g.in(f1).compute_at(f1, y).vectorize(x, 8);
1305  g.in(f2).compute_at(f2, x).unroll(x);
1306  \endcode
1307  *
1308  * You can also use it to stage loads from this Func via some
1309  * intermediate buffer (perhaps on the stack as in
1310  * test/performance/block_transpose.cpp, or in shared GPU memory
1311  * as in test/performance/wrap.cpp). In this case we compute the
1312  * wrapper at tiles of the consuming Funcs like so:
1313  \code
1314  g.compute_root()...
1315  g.in(f).compute_at(f, tiles)...
1316  \endcode
1317  *
1318  * Func::in() can also be used to compute pieces of a Func into a
1319  * smaller scratch buffer (perhaps on the GPU) and then copy them
1320  * into a larger output buffer one tile at a time. See
1321  * apps/interpolate/interpolate.cpp for an example of this. In
1322  * this case we compute the Func at tiles of its own wrapper:
1323  \code
1324  f.in(g).compute_root().gpu_tile(...)...
1325  f.compute_at(f.in(g), tiles)...
1326  \endcode
1327  *
1328  * A similar use of Func::in() is wrapping Funcs with multiple update
1329  * stages in a pure wrapper. The following code:
1330  \code
1331  f(x, y) = x + y;
1332  f(x, y) += 5;
1333  g(x, y) = f(x, y);
1334  f.compute_root();
1335  \endcode
1336  *
1337  * Is equivalent to:
1338  \code
1339  for y:
1340  for x:
1341  f(x, y) = x + y;
1342  for y:
1343  for x:
1344  f(x, y) += 5
1345  for y:
1346  for x:
1347  g(x, y) = f(x, y)
1348  \endcode
1349  * Using Func::in(), we can write:
1350  \code
1351  f(x, y) = x + y;
1352  f(x, y) += 5;
1353  g(x, y) = f(x, y);
1354  f.in(g).compute_root();
1355  \endcode
1356  * which instead produces:
1357  \code
1358  for y:
1359  for x:
1360  f(x, y) = x + y;
1361  f(x, y) += 5
1362  f_wrap(x, y) = f(x, y)
1363  for y:
1364  for x:
1365  g(x, y) = f_wrap(x, y)
1366  \endcode
1367  */
1368  Func in(const Func &f);
1369 
1370  /** Create and return an identity wrapper shared by all the Funcs in
1371  * 'fs'. If any of the Funcs in 'fs' already have a custom wrapper,
1372  * this will throw an error. */
1373  Func in(const std::vector<Func> &fs);
1374 
1375  /** Create and return a global identity wrapper, which wraps all calls to
1376  * this Func by any other Func. If a global wrapper already exists,
1377  * returns it. The global identity wrapper is only used by callers for
1378  * which no custom wrapper has been specified.
1379  */
1380  Func in();
1381 
1382  /** Similar to \ref Func::in; however, instead of replacing the call to
1383  * this Func with an identity Func that refers to it, this replaces the
1384  * call with a clone of this Func.
1385  *
1386  * For example, f.clone_in(g) would rewrite a pipeline like this:
1387  \code
1388  f(x, y) = x + y;
1389  g(x, y) = f(x, y) + 2;
1390  h(x, y) = f(x, y) - 3;
1391  \endcode
1392  * into a pipeline like this:
1393  \code
1394  f(x, y) = x + y;
1395  f_clone(x, y) = x + y;
1396  g(x, y) = f_clone(x, y) + 2;
1397  h(x, y) = f(x, y) - 3;
1398  \endcode
1399  *
1400  */
1401  //@{
1402  Func clone_in(const Func &f);
1403  Func clone_in(const std::vector<Func> &fs);
1404  //@}
1405 
1406  /** Declare that this function should be implemented by a call to
1407  * halide_buffer_copy with the given target device API. Asserts
1408  * that the Func has a pure definition which is a simple call to a
1409  * single input, and no update definitions. The wrapper Funcs
1410  * returned by in() are suitable candidates. Consumes all pure
1411  * variables, and rewrites the Func to have an extern definition
1412  * that calls halide_buffer_copy. */
1413  Func copy_to_device(DeviceAPI d = DeviceAPI::Default_GPU);
1414 
1415  /** Declare that this function should be implemented by a call to
1416  * halide_buffer_copy with a NULL target device API. Equivalent to
1417  * copy_to_device(DeviceAPI::Host). Asserts that the Func has a
1418  * pure definition which is a simple call to a single input, and
1419  * no update definitions. The wrapper Funcs returned by in() are
1420  * suitable candidates. Consumes all pure variables, and rewrites
1421  * the Func to have an extern definition that calls
1422  * halide_buffer_copy.
1423  *
1424  * Note that if the source Func is already valid in host memory,
1425  * this compiles to code that does the minimum number of calls to
1426  * memcpy.
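 *
 * For illustration, a minimal sketch (assuming a Func g scheduled on the
 * GPU, a host-side consumer of it, and Vars x, y, xi, yi declared
 * elsewhere):
 \code
 g.compute_root().gpu_tile(x, y, xi, yi, 8, 8);
 g.in().copy_to_host().compute_root();
 \endcode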
1427  */
1428  Func copy_to_host();
1429 
1430  /** Split a dimension into inner and outer subdimensions with the
1431  * given names, where the inner dimension iterates from 0 to
1432  * factor-1. The inner and outer subdimensions can then be dealt
1433  * with using the other scheduling calls. It's ok to reuse the old
1434  * variable name as either the inner or outer variable. The final
1435  * argument specifies how the tail should be handled if the split
1436  * factor does not provably divide the extent.
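 *
 * For example, a minimal sketch (Vars x, xo, xi are assumed to be
 * declared elsewhere):
 \code
 f.split(x, xo, xi, 8, TailStrategy::GuardWithIf);
 \endcode
 */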
1437  Func &split(const VarOrRVar &old, const VarOrRVar &outer, const VarOrRVar &inner, const Expr &factor, TailStrategy tail = TailStrategy::Auto);
1438 
1439  /** Join two dimensions into a single fused dimension. The fused
1440  * dimension covers the product of the extents of the inner and
1441  * outer dimensions given. */
1442  Func &fuse(const VarOrRVar &inner, const VarOrRVar &outer, const VarOrRVar &fused);
1443 
1444  /** Mark a dimension to be traversed serially. This is the default. */
1445  Func &serial(const VarOrRVar &var);
1446 
1447  /** Mark a dimension to be traversed in parallel */
1448  Func &parallel(const VarOrRVar &var);
1449 
1450  /** Split a dimension by the given task_size, and then parallelize the
1451  * outer dimension. This creates parallel tasks that have size
1452  * task_size. After this call, var refers to the outer dimension of
1453  * the split. The inner dimension has a new anonymous name. If you
1454  * wish to mutate it, or schedule with respect to it, do the split
1455  * manually. */
1456  Func &parallel(const VarOrRVar &var, const Expr &task_size, TailStrategy tail = TailStrategy::Auto);
1457 
1458  /** Mark a dimension to be computed all-at-once as a single
1459  * vector. The dimension should have constant extent -
1460  * e.g. because it is the inner dimension following a split by a
1461  * constant factor. For most uses of vectorize you want the two
1462  * argument form. The variable to be vectorized should be the
1463  * innermost one. */
1464  Func &vectorize(const VarOrRVar &var);
1465 
1466  /** Mark a dimension to be completely unrolled. The dimension
1467  * should have constant extent - e.g. because it is the inner
1468  * dimension following a split by a constant factor. For most uses
1469  * of unroll you want the two-argument form. */
1470  Func &unroll(const VarOrRVar &var);
1471 
1472  /** Split a dimension by the given factor, then vectorize the
1473  * inner dimension. This is how you vectorize a loop of unknown
1474  * size. The variable to be vectorized should be the innermost
1475  * one. After this call, var refers to the outer dimension of the
1476  * split. 'factor' must be an integer.
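 *
 * For example (a sketch; Var x is assumed to be f's innermost pure
 * dimension):
 \code
 f.vectorize(x, 8);
 \endcode
 */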
1477  Func &vectorize(const VarOrRVar &var, const Expr &factor, TailStrategy tail = TailStrategy::Auto);
1478 
1479  /** Split a dimension by the given factor, then unroll the inner
1480  * dimension. This is how you unroll a loop of unknown size by
1481  * some constant factor. After this call, var refers to the outer
1482  * dimension of the split. 'factor' must be an integer. */
1483  Func &unroll(const VarOrRVar &var, const Expr &factor, TailStrategy tail = TailStrategy::Auto);
1484 
1485  /** Statically declare that the range over which a function should
1486  * be evaluated is given by the second and third arguments. This
1487  * can let Halide perform some optimizations. E.g. if you know
1488  * there are going to be 4 color channels, you can completely
1489  * vectorize the color channel dimension without the overhead of
1490  * splitting it up. If bounds inference decides that it requires
1491  * more of this function than the bounds you have stated, a
1492  * runtime error will occur when you try to run your pipeline.
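 *
 * For example, a sketch of the four-channel case described above
 * (assuming Var c is f's color channel dimension):
 \code
 f.bound(c, 0, 4).vectorize(c);
 \endcode
 */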
1493  Func &bound(const Var &var, Expr min, Expr extent);
1494 
1495  /** Statically declare the range over which the function will be
1496  * evaluated in the general case. This provides a basis for the auto
1497  * scheduler to make trade-offs and scheduling decisions. The auto
1498  * generated schedules might break when the sizes of the dimensions are
1499  * very different from the estimates specified. These estimates are used
1500  * only by the auto scheduler if the function is a pipeline output. */
1501  Func &set_estimate(const Var &var, const Expr &min, const Expr &extent);
1502 
1503  /** Set (min, extent) estimates for all dimensions in the Func
1504  * at once; this is equivalent to calling `set_estimate(args()[n], min, extent)`
1505  * repeatedly, but slightly terser. The size of the estimates vector
1506  * must match the dimensionality of the Func.
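 *
 * For example, for a Func producing a 3-dimensional RGB image (the
 * concrete sizes below are purely illustrative):
 \code
 f.set_estimates({{0, 1920}, {0, 1080}, {0, 3}});
 \endcode
 */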
1507  Func &set_estimates(const Region &estimates);
1508 
1509  /** Expand the region computed so that the min coordinate is
1510  * congruent to 'remainder' modulo 'modulus', and the extent is a
1511  * multiple of 'modulus'. For example, f.align_bounds(x, 2) forces
1512  * the min and extent realized to be even, and calling
1513  * f.align_bounds(x, 2, 1) forces the min to be odd and the extent
1514  * to be even. The region computed always contains the region that
1515  * would have been computed without this directive, so no
1516  * assertions are injected.
1517  */
1518  Func &align_bounds(const Var &var, Expr modulus, Expr remainder = 0);
1519 
1520  /** Expand the region computed so that the extent is a
1521  * multiple of 'modulus'. For example, f.align_extent(x, 2) forces
1522  * the extent realized to be even. The region computed always contains the
1523  * region that would have been computed without this directive, so no
1524  * assertions are injected. (This is essentially equivalent to align_bounds(),
1525  * but always leaving the min untouched.)
1526  */
1527  Func &align_extent(const Var &var, Expr modulus);
1528 
1529  /** Bound the extent of a Func's realization, but not its
1530  * min. This means the dimension can be unrolled or vectorized
1531  * even when its min is not fixed (for example because it is
1532  * compute_at tiles of another Func). This can also be useful for
1533  * forcing a function's allocation to be a fixed size, which often
1534  * means it can go on the stack. */
1535  Func &bound_extent(const Var &var, Expr extent);
1536 
1537  /** Split two dimensions at once by the given factors, and then
1538  * reorder the resulting dimensions to be xi, yi, xo, yo from
1539  * innermost outwards. This gives a tiled traversal.
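 *
 * For example, a sketch of a 64x64 tiling (Vars x, y, xo, yo, xi, yi
 * are assumed to be declared elsewhere):
 \code
 f.tile(x, y, xo, yo, xi, yi, 64, 64);
 \endcode
 */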
1540  Func &tile(const VarOrRVar &x, const VarOrRVar &y,
1541  const VarOrRVar &xo, const VarOrRVar &yo,
1542  const VarOrRVar &xi, const VarOrRVar &yi,
1543  const Expr &xfactor, const Expr &yfactor,
1544  TailStrategy tail = TailStrategy::Auto);
1545 
1546  /** A shorter form of tile, which reuses the old variable names as
1547  * the new outer dimensions */
1548  Func &tile(const VarOrRVar &x, const VarOrRVar &y,
1549  const VarOrRVar &xi, const VarOrRVar &yi,
1550  const Expr &xfactor, const Expr &yfactor,
1551  TailStrategy tail = TailStrategy::Auto);
1552 
1553  /** A more general form of tile, which defines tiles of any dimensionality. */
1554  Func &tile(const std::vector<VarOrRVar> &previous,
1555  const std::vector<VarOrRVar> &outers,
1556  const std::vector<VarOrRVar> &inners,
1557  const std::vector<Expr> &factors,
1558  const std::vector<TailStrategy> &tails);
1559 
1560  /** The generalized tile, with a single tail strategy to apply to all vars. */
1561  Func &tile(const std::vector<VarOrRVar> &previous,
1562  const std::vector<VarOrRVar> &outers,
1563  const std::vector<VarOrRVar> &inners,
1564  const std::vector<Expr> &factors,
1565  TailStrategy tail = TailStrategy::Auto);
1566 
1567  /** Generalized tiling, reusing the previous names as the outer names. */
1568  Func &tile(const std::vector<VarOrRVar> &previous,
1569  const std::vector<VarOrRVar> &inners,
1570  const std::vector<Expr> &factors,
1571  TailStrategy tail = TailStrategy::Auto);
1572 
1573  /** Reorder variables to have the given nesting order, from
1574  * innermost out */
1575  Func &reorder(const std::vector<VarOrRVar> &vars);
1576 
1577  template<typename... Args>
1578  HALIDE_NO_USER_CODE_INLINE typename std::enable_if<Internal::all_are_convertible<VarOrRVar, Args...>::value, Func &>::type
1579  reorder(const VarOrRVar &x, const VarOrRVar &y, Args &&...args) {
1580  std::vector<VarOrRVar> collected_args{x, y, std::forward<Args>(args)...};
1581  return reorder(collected_args);
1582  }
1583 
1585  /** Rename a dimension. Equivalent to split with an inner size of one. */
1585  Func &rename(const VarOrRVar &old_name, const VarOrRVar &new_name);
1586 
1587  /** Specify that race conditions are permitted for this Func,
1588  * which enables parallelizing over RVars even when Halide cannot
1589  * prove that it is safe to do so. Use this with great caution,
1590  * and only if you can prove to yourself that this is safe, as it
1591  * may result in a non-deterministic routine that returns
1592  * different values at different times or on different machines. */
1593  Func &allow_race_conditions();
1594 
1595  /** Issue atomic updates for this Func. This allows parallelization
1596  * on associative RVars. The function throws a compile error when
1597  * Halide fails to prove associativity. Use override_associativity_test
1598  * to disable the associativity test if you believe the function is
1599  * associative or the order of reduction variable execution does not
1600  * matter.
1601  * Halide compiles this into hardware atomic operations whenever possible,
1602  * and falls back to a mutex lock per storage element if it is impossible
1603  * to atomically update.
1604  * There are three possible outcomes of the compiled code:
1605  * atomic add, compare-and-swap loop, and mutex lock.
1606  * For example:
1607  *
1608  * hist(x) = 0;
1609  * hist(im(r)) += 1;
1610  * hist.compute_root();
1611  * hist.update().atomic().parallel();
1612  *
1613  * will be compiled to atomic add operations.
1614  *
1615  * hist(x) = 0;
1616  * hist(im(r)) = min(hist(im(r)) + 1, 100);
1617  * hist.compute_root();
1618  * hist.update().atomic().parallel();
1619  *
1620  * will be compiled to compare-and-swap loops.
1621  *
1622  * arg_max() = {0, im(0)};
1623  * Expr old_index = arg_max()[0];
1624  * Expr old_max = arg_max()[1];
1625  * Expr new_index = select(old_max < im(r), r, old_index);
1626  * Expr new_max = max(im(r), old_max);
1627  * arg_max() = {new_index, new_max};
1628  * arg_max.compute_root();
1629  * arg_max.update().atomic().parallel();
1630  *
1631  * will be compiled to updates guarded by a mutex lock,
1632  * since it is impossible to atomically update two different locations.
1633  *
1634  * Currently the atomic operation is supported by x86, CUDA, and OpenCL backends.
1635  * Compiling to other backends results in a compile error.
1636  * If an operation is compiled into a mutex lock, and is vectorized or is
1637  * compiled to CUDA or OpenCL, it also results in a compile error,
1638  * since per-element mutex lock on vectorized operation leads to a
1639  * deadlock.
1640  * Vectorization of predicated RVars (through rdom.where()) on CPU
1641  * is also not yet supported (see https://github.com/halide/Halide/issues/4298).
1642  * 8-bit and 16-bit atomics on GPU are also not supported. */
1643  Func &atomic(bool override_associativity_test = false);
1644 
1645  /** Specialize a Func. This creates a special-case version of the
1646  * Func where the given condition is true. The most effective
1647  * conditions are those of the form param == value, and boolean
1648  * Params. Consider a simple example:
1649  \code
1650  f(x) = x + select(cond, 0, 1);
1651  f.compute_root();
1652  \endcode
1653  * This is equivalent to:
1654  \code
1655  for (int x = 0; x < width; x++) {
1656  f[x] = x + (cond ? 0 : 1);
1657  }
1658  \endcode
1659  * Adding the scheduling directive:
1660  \code
1661  f.specialize(cond)
1662  \endcode
1663  * makes it equivalent to:
1664  \code
1665  if (cond) {
1666  for (int x = 0; x < width; x++) {
1667  f[x] = x;
1668  }
1669  } else {
1670  for (int x = 0; x < width; x++) {
1671  f[x] = x + 1;
1672  }
1673  }
1674  \endcode
1675  * Note that the inner loops have been simplified. In the first
1676  * path Halide knows that cond is true, and in the second path
1677  * Halide knows that it is false.
1678  *
1679  * The specialized version gets its own schedule, which inherits
1680  * every directive made about the parent Func's schedule so far
1681  * except for its specializations. This method returns a handle to
1682  * the new schedule. If you wish to retrieve the specialized
1683  * sub-schedule again later, you can call this method with the
1684  * same condition. Consider the following example of scheduling
1685  * the specialized version:
1686  *
1687  \code
1688  f(x) = x;
1689  f.compute_root();
1690  f.specialize(width > 1).unroll(x, 2);
1691  \endcode
1692  * Assuming for simplicity that width is even, this is equivalent to:
1693  \code
1694  if (width > 1) {
1695  for (int x = 0; x < width/2; x++) {
1696  f[2*x] = 2*x;
1697  f[2*x + 1] = 2*x + 1;
1698  }
1699  } else {
1700  for (int x = 0; x < width; x++) {
1701  f[x] = x;
1702  }
1703  }
1704  \endcode
1705  * For this case, it may be better to schedule the un-specialized
1706  * case instead:
1707  \code
1708  f(x) = x;
1709  f.compute_root();
1710  f.specialize(width == 1); // Creates a copy of the schedule so far.
1711  f.unroll(x, 2); // Only applies to the unspecialized case.
1712  \endcode
1713  * This is equivalent to:
1714  \code
1715  if (width == 1) {
1716  f[0] = 0;
1717  } else {
1718  for (int x = 0; x < width/2; x++) {
1719  f[2*x] = 2*x;
1720  f[2*x + 1] = 2*x + 1;
1721  }
1722  }
1723  \endcode
1724  * This can be a good way to write a pipeline that splits,
1725  * vectorizes, or tiles, but can still handle small inputs.
1726  *
1727  * If a Func has several specializations, the first matching one
1728  * will be used, so the order in which you define specializations
1729  * is significant. For example:
1730  *
1731  \code
1732  f(x) = x + select(cond1, a, b) - select(cond2, c, d);
1733  f.specialize(cond1);
1734  f.specialize(cond2);
1735  \endcode
1736  * is equivalent to:
1737  \code
1738  if (cond1) {
1739  for (int x = 0; x < width; x++) {
1740  f[x] = x + a - (cond2 ? c : d);
1741  }
1742  } else if (cond2) {
1743  for (int x = 0; x < width; x++) {
1744  f[x] = x + b - c;
1745  }
1746  } else {
1747  for (int x = 0; x < width; x++) {
1748  f[x] = x + b - d;
1749  }
1750  }
1751  \endcode
1752  *
1753  * Specializations may in turn be specialized, which creates a
1754  * nested if statement in the generated code.
1755  *
1756  \code
1757  f(x) = x + select(cond1, a, b) - select(cond2, c, d);
1758  f.specialize(cond1).specialize(cond2);
1759  \endcode
1760  * This is equivalent to:
1761  \code
1762  if (cond1) {
1763  if (cond2) {
1764  for (int x = 0; x < width; x++) {
1765  f[x] = x + a - c;
1766  }
1767  } else {
1768  for (int x = 0; x < width; x++) {
1769  f[x] = x + a - d;
1770  }
1771  }
1772  } else {
1773  for (int x = 0; x < width; x++) {
1774  f[x] = x + b - (cond2 ? c : d);
1775  }
1776  }
1777  \endcode
1778  * To create a 4-way if statement that simplifies away all of the
1779  * ternary operators above, you could say:
1780  \code
1781  f.specialize(cond1).specialize(cond2);
1782  f.specialize(cond2);
1783  \endcode
1784  * or
1785  \code
1786  f.specialize(cond1 && cond2);
1787  f.specialize(cond1);
1788  f.specialize(cond2);
1789  \endcode
1790  *
1791  * Any prior Func which is compute_at some variable of this Func
1792  * gets separately included in all paths of the generated if
1793  * statement. The Var in the compute_at call must exist in all
1794  * paths, but it may have been generated via a different path of
1795  * splits, fuses, and renames. This can be used somewhat
1796  * creatively. Consider the following code:
1797  \code
1798  g(x, y) = 8*x;
1799  f(x, y) = g(x, y) + 1;
1800  f.compute_root().specialize(cond);
1801  Var g_loop;
1802  f.specialize(cond).rename(y, g_loop);
1803  f.rename(x, g_loop);
1804  g.compute_at(f, g_loop);
1805  \endcode
1806  * When cond is true, this is equivalent to g.compute_at(f,y).
1807  * When it is false, this is equivalent to g.compute_at(f,x).
1808  */
1809  Stage specialize(const Expr &condition);
1810 
1811  /** Add a specialization to a Func that always terminates execution
1812  * with a call to halide_error(). By itself, this is of limited use,
1813  * but can be useful to terminate chains of specialize() calls where
1814  * no "default" case is expected (thus avoiding unnecessary code generation).
1815  *
1816  * For instance, say we want to optimize a pipeline to process images
1817  * in planar and interleaved format; we might typically do something like:
1818  \code
1819  ImageParam im(UInt(8), 3);
1820  Func f = do_something_with(im);
1821  f.specialize(im.dim(0).stride() == 1).vectorize(x, 8); // planar
1822  f.specialize(im.dim(2).stride() == 1).reorder(c, x, y).vectorize(c); // interleaved
1823  \endcode
1824  * This code will vectorize along rows for the planar case, and across pixel
1825  * components for the interleaved case... but there is an implicit "else"
1826  * for the unhandled cases, which generates unoptimized code. If we never
1827  * anticipate passing any other sort of images to this, we can streamline
1828  * our code by adding specialize_fail():
1829  \code
1830  ImageParam im(UInt(8), 3);
1831  Func f = do_something_with(im);
1832  f.specialize(im.dim(0).stride() == 1).vectorize(x, 8); // planar
1833  f.specialize(im.dim(2).stride() == 1).reorder(c, x, y).vectorize(c); // interleaved
1834  f.specialize_fail("Unhandled image format");
1835  \endcode
1836  * Conceptually, this produces code like:
1837  \code
1838  if (im.dim(0).stride() == 1) {
1839  do_something_planar();
1840  } else if (im.dim(2).stride() == 1) {
1841  do_something_interleaved();
1842  } else {
1843  halide_error("Unhandled image format");
1844  }
1845  \endcode
1846  *
1847  * Note that calling specialize_fail() terminates the specialization chain
1848  * for a given Func; you cannot create new specializations for the Func
1849  * afterwards (though you can retrieve handles to previous specializations).
1850  */
1851  void specialize_fail(const std::string &message);
1852 
1853  /** Tell Halide that the following dimensions correspond to GPU
1854  * thread indices. This is useful if you compute a producer
1855  * function within the block indices of a consumer function, and
1856  * want to control how that function's dimensions map to GPU
1857  * threads. If the selected target is not an appropriate GPU, this
1858  * just marks those dimensions as parallel. */
1859  // @{
1860  Func &gpu_threads(const VarOrRVar &thread_x, DeviceAPI device_api = DeviceAPI::Default_GPU);
1861  Func &gpu_threads(const VarOrRVar &thread_x, const VarOrRVar &thread_y, DeviceAPI device_api = DeviceAPI::Default_GPU);
1862  Func &gpu_threads(const VarOrRVar &thread_x, const VarOrRVar &thread_y, const VarOrRVar &thread_z, DeviceAPI device_api = DeviceAPI::Default_GPU);
1863  // @}
1864 
1865  /** The given dimension corresponds to the lanes in a GPU
1866  * warp. GPU warp lanes are distinguished from GPU threads by the
1867  * fact that all warp lanes run together in lockstep, which
1868  * permits lightweight communication of data from one lane to
1869  * another. */
1870  Func &gpu_lanes(const VarOrRVar &thread_x, DeviceAPI device_api = DeviceAPI::Default_GPU);
1871 
1872  /** Tell Halide to run this stage using a single gpu thread and
1873  * block. This is not an efficient use of your GPU, but it can be
1874  * useful to avoid copy-back for intermediate update stages that
1875  * touch a very small part of your Func. */
1876  Func &gpu_single_thread(DeviceAPI device_api = DeviceAPI::Default_GPU);
1877 
1878  /** Tell Halide that the following dimensions correspond to GPU
1879  * block indices. This is useful for scheduling stages that will
1880  * run serially within each GPU block. If the selected target is
1881  * not ptx, this just marks those dimensions as parallel. */
1882  // @{
1883  Func &gpu_blocks(const VarOrRVar &block_x, DeviceAPI device_api = DeviceAPI::Default_GPU);
1884  Func &gpu_blocks(const VarOrRVar &block_x, const VarOrRVar &block_y, DeviceAPI device_api = DeviceAPI::Default_GPU);
1885  Func &gpu_blocks(const VarOrRVar &block_x, const VarOrRVar &block_y, const VarOrRVar &block_z, DeviceAPI device_api = DeviceAPI::Default_GPU);
1886  // @}
1887 
1888  /** Tell Halide that the following dimensions correspond to GPU
1889  * block indices and thread indices. If the selected target is not
1890  * ptx, these just mark the given dimensions as parallel. The
1891  * dimensions are consumed by this call, so do all other
1892  * unrolling, reordering, etc first. */
1893  // @{
1894  Func &gpu(const VarOrRVar &block_x, const VarOrRVar &thread_x, DeviceAPI device_api = DeviceAPI::Default_GPU);
1895  Func &gpu(const VarOrRVar &block_x, const VarOrRVar &block_y,
1896  const VarOrRVar &thread_x, const VarOrRVar &thread_y, DeviceAPI device_api = DeviceAPI::Default_GPU);
1897  Func &gpu(const VarOrRVar &block_x, const VarOrRVar &block_y, const VarOrRVar &block_z,
1898  const VarOrRVar &thread_x, const VarOrRVar &thread_y, const VarOrRVar &thread_z, DeviceAPI device_api = DeviceAPI::Default_GPU);
1899  // @}
1900 
1901  /** Short-hand for tiling a domain and mapping the tile indices
1902  * to GPU block indices and the coordinates within each tile to
1903  * GPU thread indices. Consumes the variables given, so do all
1904  * other scheduling first.
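 *
 * For example, a sketch that maps f onto the GPU in 16x16 thread blocks
 * (Vars x, y, xi, yi are assumed to be declared elsewhere):
 \code
 f.gpu_tile(x, y, xi, yi, 16, 16);
 \endcode
 */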
1905  // @{
1906  Func &gpu_tile(const VarOrRVar &x, const VarOrRVar &bx, const VarOrRVar &tx, const Expr &x_size,
1907  TailStrategy tail = TailStrategy::Auto,
1908  DeviceAPI device_api = DeviceAPI::Default_GPU);
1909 
1910  Func &gpu_tile(const VarOrRVar &x, const VarOrRVar &tx, const Expr &x_size,
1911  TailStrategy tail = TailStrategy::Auto,
1912  DeviceAPI device_api = DeviceAPI::Default_GPU);
1913  Func &gpu_tile(const VarOrRVar &x, const VarOrRVar &y,
1914  const VarOrRVar &bx, const VarOrRVar &by,
1915  const VarOrRVar &tx, const VarOrRVar &ty,
1916  const Expr &x_size, const Expr &y_size,
1917  TailStrategy tail = TailStrategy::Auto,
1918  DeviceAPI device_api = DeviceAPI::Default_GPU);
1919 
1920  Func &gpu_tile(const VarOrRVar &x, const VarOrRVar &y,
1921  const VarOrRVar &tx, const VarOrRVar &ty,
1922  const Expr &x_size, const Expr &y_size,
1923  TailStrategy tail = TailStrategy::Auto,
1924  DeviceAPI device_api = DeviceAPI::Default_GPU);
1925 
1926  Func &gpu_tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &z,
1927  const VarOrRVar &bx, const VarOrRVar &by, const VarOrRVar &bz,
1928  const VarOrRVar &tx, const VarOrRVar &ty, const VarOrRVar &tz,
1929  const Expr &x_size, const Expr &y_size, const Expr &z_size,
1930  TailStrategy tail = TailStrategy::Auto,
1931  DeviceAPI device_api = DeviceAPI::Default_GPU);
1932  Func &gpu_tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &z,
1933  const VarOrRVar &tx, const VarOrRVar &ty, const VarOrRVar &tz,
1934  const Expr &x_size, const Expr &y_size, const Expr &z_size,
1935  TailStrategy tail = TailStrategy::Auto,
1936  DeviceAPI device_api = DeviceAPI::Default_GPU);
1937  // @}
1938 
1939  /** Schedule for execution on Hexagon. When a loop is marked with
1940  * Hexagon, that loop is executed on a Hexagon DSP. */
1941  Func &hexagon(const VarOrRVar &x = Var::outermost());
1942 
1943  /** Prefetch data written to or read from a Func or an ImageParam by a
1944  * subsequent loop iteration, at an optionally specified iteration offset.
1945  * 'var' specifies at which loop level the prefetch calls should be inserted.
1946  * The final argument specifies how prefetch of region outside bounds
1947  * should be handled.
1948  *
1949  * For example, consider this pipeline:
1950  \code
1951  Func f, g;
1952  Var x, y;
1953  f(x, y) = x + y;
1954  g(x, y) = 2 * f(x, y);
1955  \endcode
1956  *
1957  * The following schedule:
1958  \code
1959  f.compute_root();
1960  g.prefetch(f, x, 2, PrefetchBoundStrategy::NonFaulting);
1961  \endcode
1962  *
1963  * will inject prefetch call at the innermost loop of 'g' and generate
1964  * the following loop nest:
1965  * for y = ...
1966  * for x = ...
1967  * f(x, y) = x + y
1968  * for y = ..
1969  * for x = ...
1970  * prefetch(&f[x + 2, y], 1, 16);
1971  * g(x, y) = 2 * f(x, y)
1972  */
1973  // @{
1974  HALIDE_ATTRIBUTE_DEPRECATED("Call prefetch() with the two-var form instead.")
1975  Func &prefetch(const Func &f, const VarOrRVar &var, int offset = 1,
1976  PrefetchBoundStrategy strategy = PrefetchBoundStrategy::GuardWithIf) {
1977  return prefetch(f, var, var, offset, strategy);
1978  }
1979  HALIDE_ATTRIBUTE_DEPRECATED("Call prefetch() with the two-var form instead.")
1980  Func &prefetch(const Internal::Parameter &param, const VarOrRVar &var, int offset = 1,
1981  PrefetchBoundStrategy strategy = PrefetchBoundStrategy::GuardWithIf) {
1982  return prefetch(param, var, var, offset, strategy);
1983  }
1984  template<typename T>
1985  HALIDE_ATTRIBUTE_DEPRECATED("Call prefetch() with the two-var form instead.")
1986  Func &prefetch(const T &image, VarOrRVar var, int offset = 1,
1987  PrefetchBoundStrategy strategy = PrefetchBoundStrategy::GuardWithIf) {
1988  return prefetch<T>(image, var, var, offset, strategy);
1989  }
1990  // @}
1991 
1992  /** This version of prefetch() is more fine-grained than the one above: it
1993  * allows specification of different vars for the location of the prefetch()
1994  * instruction vs. the location that is being prefetched:
1995  *
1996  * - the first var specified, 'at', indicates the loop in which the prefetch will be placed
1997  * - the second var specified, 'from', determines the var used to find the bounds to prefetch
1998  * (in conjunction with 'offset')
1999  *
2000  * If 'at' and 'from' are distinct vars, then 'from' must be at a nesting level outside 'at.'
2001  * Note that the value for 'offset' applies only to 'from', not 'at'.
2002  *
2003  * For example, consider this pipeline:
2004  \code
2005  Func f, g;
2006  Var x, y, z;
2007  f(x, y) = x + y;
2008  g(x, y) = 2 * f(x, y);
2009  h(x, y) = 3 * f(x, y);
2010  \endcode
2011  *
2012  * The following schedule:
2013  \code
2014  f.compute_root();
2015  g.prefetch(f, x, x, 2, PrefetchBoundStrategy::NonFaulting);
2016  h.prefetch(f, x, y, 2, PrefetchBoundStrategy::NonFaulting);
2017  \endcode
2018  *
2019  * will inject prefetch call at the innermost loop of 'g' and 'h' and generate
2020  * the following loop nest:
2021  \code
2022  for y = ...
2023  for x = ...
2024  f(x, y) = x + y
2025  for y = ..
2026  for x = ...
2027  prefetch(&f[x + 2, y], 1, 16);
2028  g(x, y) = 2 * f(x, y)
2029  for y = ..
2030  for x = ...
2031  prefetch(&f[x, y + 2], 1, 16);
2032  h(x, y) = 3 * f(x, y)
2033  \endcode
2034  *
2035  * Note that the 'from' nesting level need not be adjacent to 'at':
2036  \code
2037  Func f, g;
2038  Var x, y, z, w;
2039  f(x, y, z, w) = x + y + z + w;
2040  g(x, y, z, w) = 2 * f(x, y, z, w);
2041  \endcode
2042  *
2043  * The following schedule:
2044  \code
2045  f.compute_root();
2046  g.prefetch(f, y, w, 2, PrefetchBoundStrategy::NonFaulting);
2047  \endcode
2048  *
2049  * will produce code that prefetches a tile of data:
2050  \code
2051  for w = ...
2052  for z = ...
2053  for y = ...
2054  for x = ...
2055  f(x, y, z, w) = x + y + z + w
2056  for w = ...
2057  for z = ...
2058  for y = ...
2059  for x0 = ...
2060  prefetch(&f[x0, y, z, w + 2], 1, 16);
2061  for x = ...
2062  g(x, y, z, w) = 2 * f(x, y, z, w)
2063  \endcode
2064  *
2065  * Note that calling prefetch() with the same var for both 'at' and 'from'
2066  * is equivalent to calling prefetch() with that var.
2067  */
2068  // @{
2069  Func &prefetch(const Func &f, const VarOrRVar &at, const VarOrRVar &from, Expr offset = 1,
2070  PrefetchBoundStrategy strategy = PrefetchBoundStrategy::GuardWithIf);
2071  Func &prefetch(const Internal::Parameter &param, const VarOrRVar &at, const VarOrRVar &from, Expr offset = 1,
2072  PrefetchBoundStrategy strategy = PrefetchBoundStrategy::GuardWithIf);
2073  template<typename T>
2074  Func &prefetch(const T &image, const VarOrRVar &at, const VarOrRVar &from, Expr offset = 1,
2075  PrefetchBoundStrategy strategy = PrefetchBoundStrategy::GuardWithIf) {
2076  return prefetch(image.parameter(), at, from, std::move(offset), strategy);
2077  }
2078  // @}
2079 
2080  /** Specify how the storage for the function is laid out. These
2081  * calls let you specify the nesting order of the dimensions. For
2082  * example, foo.reorder_storage(y, x) tells Halide to use
2083  * column-major storage for any realizations of foo, without
2084  * changing how you refer to foo in the code. You may want to do
2085  * this if you intend to vectorize across y. When representing
2086  * color images, foo.reorder_storage(c, x, y) specifies packed
2087  * storage (red, green, and blue values adjacent in memory), and
2088  * foo.reorder_storage(x, y, c) specifies planar storage (entire
2089  * red, green, and blue images one after the other in memory).
2090  *
2091  * If you leave out some dimensions, those remain in the same
2092  * positions in the nesting order while the specified variables
2093  * are reordered around them. */
2094  // @{
2095  Func &reorder_storage(const std::vector<Var> &dims);
2096 
2097  Func &reorder_storage(const Var &x, const Var &y);
2098  template<typename... Args>
2099  HALIDE_NO_USER_CODE_INLINE typename std::enable_if<Internal::all_are_convertible<Var, Args...>::value, Func &>::type
2100  reorder_storage(const Var &x, const Var &y, Args &&...args) {
2101  std::vector<Var> collected_args{x, y, std::forward<Args>(args)...};
2102  return reorder_storage(collected_args);
2103  }
2104  // @}
2105 
2106  /** Pad the storage extent of a particular dimension of
2107  * realizations of this function up to be a multiple of the
2108  * specified alignment. This guarantees that the strides for the
2109  * dimensions stored outside of dim will be multiples of the
2110  * specified alignment, where the strides and alignment are
2111  * measured in numbers of elements.
2112  *
2113  * For example, to guarantee that a function foo(x, y, c)
2114  * representing an image has scanlines starting on offsets
2115  * aligned to multiples of 16, use foo.align_storage(x, 16). */
2116  Func &align_storage(const Var &dim, const Expr &alignment);
2117 
2118  /** Store realizations of this function in a circular buffer of a
2119  * given extent. This is more efficient when the extent of the
2120  * circular buffer is a power of 2. If the fold factor is too
2121  * small, or the dimension is not accessed monotonically, the
2122  * pipeline will generate an error at runtime.
2123  *
2124  * The fold_forward option indicates that the new values of the
2125  * producer are accessed by the consumer in a monotonically
2126  * increasing order. Folding storage of producers is also
2127  * supported if the new values are accessed in a monotonically
2128  * decreasing order by setting fold_forward to false.
2129  *
2130  * For example, consider the pipeline:
2131  \code
2132  Func f, g;
2133  Var x, y;
2134  g(x, y) = x*y;
2135  f(x, y) = g(x, y) + g(x, y+1);
2136  \endcode
2137  *
2138  * If we schedule f like so:
2139  *
2140  \code
2141  g.compute_at(f, y).store_root().fold_storage(y, 2);
2142  \endcode
2143  *
2144  * Then g will be computed at each row of f and stored in a buffer
2145  * with an extent in y of 2, alternately storing each computed row
2146  * of g in row y=0 or y=1.
2147  */
2148  Func &fold_storage(const Var &dim, const Expr &extent, bool fold_forward = true);
2149 
2150  /** Compute this function as needed for each unique value of the
2151  * given var for the given calling function f.
2152  *
2153  * For example, consider the simple pipeline:
2154  \code
2155  Func f, g;
2156  Var x, y;
2157  g(x, y) = x*y;
2158  f(x, y) = g(x, y) + g(x, y+1) + g(x+1, y) + g(x+1, y+1);
2159  \endcode
2160  *
2161  * If we schedule f like so:
2162  *
2163  \code
2164  g.compute_at(f, x);
2165  \endcode
2166  *
2167  * Then the C code equivalent to this pipeline will look like this
2168  *
2169  \code
2170 
2171  int f[height][width];
2172  for (int y = 0; y < height; y++) {
2173  for (int x = 0; x < width; x++) {
2174  int g[2][2];
2175  g[0][0] = x*y;
2176  g[0][1] = (x+1)*y;
2177  g[1][0] = x*(y+1);
2178  g[1][1] = (x+1)*(y+1);
2179  f[y][x] = g[0][0] + g[1][0] + g[0][1] + g[1][1];
2180  }
2181  }
2182 
2183  \endcode
2184  *
2185  * The allocation and computation of g is within f's loop over x,
2186  * and enough of g is computed to satisfy all that f will need for
2187  * that iteration. This has excellent locality - values of g are
2188  * used as soon as they are computed, but it does redundant
2189  * work. Each value of g ends up getting computed four times. If
2190  * we instead schedule f like so:
2191  *
2192  \code
2193  g.compute_at(f, y);
2194  \endcode
2195  *
2196  * The equivalent C code is:
2197  *
2198  \code
2199  int f[height][width];
2200  for (int y = 0; y < height; y++) {
2201  int g[2][width+1];
2202  for (int x = 0; x < width; x++) {
2203  g[0][x] = x*y;
2204  g[1][x] = x*(y+1);
2205  }
2206  for (int x = 0; x < width; x++) {
2207  f[y][x] = g[0][x] + g[1][x] + g[0][x+1] + g[1][x+1];
2208  }
2209  }
2210  \endcode
2211  *
2212  * The allocation and computation of g is within f's loop over y,
2213  * and enough of g is computed to satisfy all that f will need for
2214  * that iteration. This does less redundant work (each point in g
2215  * ends up being evaluated twice), but the locality is not quite
2216  * as good, and we have to allocate more temporary memory to store
2217  * g.
2218  */
2219  Func &compute_at(const Func &f, const Var &var);
2220 
2221  /** Schedule a function to be computed within the iteration over
2222  * some dimension of an update domain. Produces equivalent code
2223  * to the version of compute_at that takes a Var. */
2224  Func &compute_at(const Func &f, const RVar &var);
2225 
2226  /** Schedule a function to be computed within the iteration over
2227  * a given LoopLevel. */
2228  Func &compute_at(LoopLevel loop_level);
2229 
2230  /** Schedule the iteration over the initial definition of this function
2231  * to be fused with another stage 's' from outermost loop to a
2232  * given LoopLevel. */
2233  // @{
2234  Func &compute_with(const Stage &s, const VarOrRVar &var, const std::vector<std::pair<VarOrRVar, LoopAlignStrategy>> &align);
2235  Func &compute_with(const Stage &s, const VarOrRVar &var, LoopAlignStrategy align = LoopAlignStrategy::Auto);
2236  Func &compute_with(LoopLevel loop_level, const std::vector<std::pair<VarOrRVar, LoopAlignStrategy>> &align);
2237  Func &compute_with(LoopLevel loop_level, LoopAlignStrategy align = LoopAlignStrategy::Auto);
2238 
2239  /** Compute all of this function once ahead of time. Reusing
2240  * the example in \ref Func::compute_at :
2241  *
2242  \code
2243  Func f, g;
2244  Var x, y;
2245  g(x, y) = x*y;
2246  f(x, y) = g(x, y) + g(x, y+1) + g(x+1, y) + g(x+1, y+1);
2247 
2248  g.compute_root();
2249  \endcode
2250  *
2251  * is equivalent to
2252  *
2253  \code
2254  int f[height][width];
2255  int g[height+1][width+1];
2256  for (int y = 0; y < height+1; y++) {
2257  for (int x = 0; x < width+1; x++) {
2258  g[y][x] = x*y;
2259  }
2260  }
2261  for (int y = 0; y < height; y++) {
2262  for (int x = 0; x < width; x++) {
2263  f[y][x] = g[y][x] + g[y+1][x] + g[y][x+1] + g[y+1][x+1];
2264  }
2265  }
2266  \endcode
2267  *
2268  * g is computed once ahead of time, and enough is computed to
2269  * satisfy all uses of it. This does no redundant work (each point
2270  * in g is evaluated once), but has poor locality (values of g are
2271  * probably not still in cache when they are used by f), and
2272  * allocates lots of temporary memory to store g.
2273  */
2274  Func &compute_root();
2275 
2276  /** Use the halide_memoization_cache_... interface to store a
2277  * computed version of this function across invocations of the
2278  * Func.
2279  *
2280  * If an eviction_key is provided, it must be constructed with
2281  * Expr of integer or handle type. The key Expr will be promoted
2282  * to a uint64_t and can be used with halide_memoization_cache_evict
2283  * to remove memoized entries using this eviction key from the
2284  * cache. Memoized computations that do not provide an eviction
2285  * key will never be evicted by this mechanism.
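 *
 * For example, a sketch using a scalar parameter as the eviction key
 * (the Param 'version' is assumed to be declared elsewhere):
 \code
 f.compute_root().memoize(EvictionKey(version));
 \endcode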
2286  */
2287  Func &memoize(const EvictionKey &eviction_key = EvictionKey());
2288 
2289  /** Produce this Func asynchronously in a separate
2290  * thread. Consumers will be run by the task system when the
2291  * production is complete. If this Func's store level is different
2292  * to its compute level, consumers will be run concurrently,
2293  * blocking as necessary to prevent reading ahead of what the
2294  * producer has computed. If storage is folded, then the producer
2295  * will additionally not be permitted to run too far ahead of the
2296  * consumer, to avoid clobbering data that has not yet been
2297  * used.
2298  *
2299  * Take special care when combining this with custom thread pool
2300  * implementations, as avoiding deadlock with producer-consumer
2301  * parallelism requires a much more sophisticated parallel runtime
2302  * than with data parallelism alone. It is strongly recommended
2303  * you just use Halide's default thread pool, which guarantees no
2304  * deadlock and a bound on the number of threads launched.
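 *
 * For example, a minimal sketch in which the producer g runs in its own
 * thread while its consumer f proceeds row by row (Var y is assumed to
 * be declared elsewhere):
 \code
 g.store_root().compute_at(f, y).async();
 \endcode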
2305  */
2306  Func &async();
2307 
2308  /** Bound the extent of a Func's storage, but not extent of its
2309  * compute. This can be useful for forcing a function's allocation
2310  * to be a fixed size, which often means it can go on the stack.
2311  * If bounds inference decides that it requires more storage for
2312  * this function than the allocation size you have stated, a runtime
2313  * error will occur when you try to run the pipeline.
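 *
 * For example, a sketch that fixes the storage extent of f's innermost
 * dimension (the consumer g and Vars x, y are assumed to be declared
 * elsewhere):
 \code
 f.compute_at(g, y).bound_storage(x, 16);
 \endcode
 */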
2314  Func &bound_storage(const Var &dim, const Expr &bound);
2315 
2316  /** Allocate storage for this function within f's loop over
2317  * var. Scheduling storage is optional, and can be used to
2318  * separate the loop level at which storage occurs from the loop
2319  * level at which computation occurs to trade off between locality
2320  * and redundant work. This can open the door for two types of
2321  * optimization.
2322  *
2323  * Consider again the pipeline from \ref Func::compute_at :
2324  \code
2325  Func f, g;
2326  Var x, y;
2327  g(x, y) = x*y;
2328  f(x, y) = g(x, y) + g(x+1, y) + g(x, y+1) + g(x+1, y+1);
2329  \endcode
2330  *
2331  * If we schedule it like so:
2332  *
2333  \code
2334  g.compute_at(f, x).store_at(f, y);
2335  \endcode
2336  *
2337  * Then the computation of g takes place within the loop over x,
2338  * but the storage takes place within the loop over y:
2339  *
2340  \code
2341  int f[height][width];
2342  for (int y = 0; y < height; y++) {
2343  int g[2][width+1];
2344  for (int x = 0; x < width; x++) {
2345  g[0][x] = x*y;
2346  g[0][x+1] = (x+1)*y;
2347  g[1][x] = x*(y+1);
2348  g[1][x+1] = (x+1)*(y+1);
2349  f[y][x] = g[0][x] + g[1][x] + g[0][x+1] + g[1][x+1];
2350  }
2351  }
2352  \endcode
2353  *
2354  * Provided the for loop over x is serial, halide then
2355  * automatically performs the following sliding window
2356  * optimization:
2357  *
2358  \code
2359  int f[height][width];
2360  for (int y = 0; y < height; y++) {
2361  int g[2][width+1];
2362  for (int x = 0; x < width; x++) {
2363  if (x == 0) {
2364  g[0][x] = x*y;
2365  g[1][x] = x*(y+1);
2366  }
2367  g[0][x+1] = (x+1)*y;
2368  g[1][x+1] = (x+1)*(y+1);
2369  f[y][x] = g[0][x] + g[1][x] + g[0][x+1] + g[1][x+1];
2370  }
2371  }
2372  \endcode
2373  *
2374  * Two of the assignments to g only need to be done when x is
2375  * zero. The rest of the time, those sites have already been
2376  * filled in by a previous iteration. This version has the
2377  * locality of compute_at(f, x), but allocates more memory and
2378  * does much less redundant work.
2379  *
2380  * Halide then further optimizes this pipeline like so:
2381  *
2382  \code
2383  int f[height][width];
2384  for (int y = 0; y < height; y++) {
2385  int g[2][2];
2386  for (int x = 0; x < width; x++) {
2387  if (x == 0) {
2388  g[0][0] = x*y;
2389  g[1][0] = x*(y+1);
2390  }
2391  g[0][(x+1)%2] = (x+1)*y;
2392  g[1][(x+1)%2] = (x+1)*(y+1);
2393  f[y][x] = g[0][x%2] + g[1][x%2] + g[0][(x+1)%2] + g[1][(x+1)%2];
2394  }
2395  }
2396  \endcode
2397  *
2398  * Halide has detected that it's possible to use a circular buffer
2399  * to represent g, and has reduced all accesses to g modulo 2 in
2400  * the x dimension. This optimization only triggers if the for
2401  * loop over x is serial, and if halide can statically determine
2402  * some power of two large enough to cover the range needed. For
2403  * powers of two, the modulo operator compiles to more efficient
2404  * bit-masking. This optimization reduces memory usage, and also
2405  * improves locality by reusing recently-accessed memory instead
2406  * of pulling new memory into cache.
2407  *
2408  */
2409  Func &store_at(const Func &f, const Var &var);
2410 
2411  /** Equivalent to the version of store_at that takes a Var, but
2412  * schedules storage within the loop over a dimension of a
2413  * reduction domain */
2414  Func &store_at(const Func &f, const RVar &var);
2415 
2416  /** Equivalent to the version of store_at that takes a Var, but
2417  * schedules storage at a given LoopLevel. */
2418  Func &store_at(LoopLevel loop_level);
2419 
2420  /** Equivalent to \ref Func::store_at, but schedules storage
2421  * outside the outermost loop. */
2422  Func &store_root();
2423 
2424  /** Aggressively inline all uses of this function. This is the
2425  * default schedule, so you're unlikely to need to call this. For
2426  * a Func with an update definition, that means it gets computed
2427  * as close to the innermost loop as possible.
2428  *
2429  * Consider once more the pipeline from \ref Func::compute_at :
2430  *
2431  \code
2432  Func f, g;
2433  Var x, y;
2434  g(x, y) = x*y;
2435  f(x, y) = g(x, y) + g(x+1, y) + g(x, y+1) + g(x+1, y+1);
2436  \endcode
2437  *
2438  * Leaving g as inline, this compiles to code equivalent to the following C:
2439  *
2440  \code
2441  int f[height][width];
2442  for (int y = 0; y < height; y++) {
2443  for (int x = 0; x < width; x++) {
2444  f[y][x] = x*y + x*(y+1) + (x+1)*y + (x+1)*(y+1);
2445  }
2446  }
2447  \endcode
2448  */
2449  Func &compute_inline();
2450 
2451  /** Get a handle on an update step for the purposes of scheduling
2452  * it. */
2453  Stage update(int idx = 0);
2454 
2455  /** Set the type of memory this Func should be stored in. Controls
2456  * whether allocations go on the stack or the heap on the CPU, and
2457  * in global vs shared vs local on the GPU. See the documentation
2458  * on MemoryType for more detail.
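 *
 * For example, a sketch that asks for a small intermediate to live in
 * GPU shared memory (the consumer g and Var x are assumed to be
 * declared elsewhere, with g's x mapped to GPU blocks):
 \code
 f.compute_at(g, x).store_in(MemoryType::GPUShared);
 \endcode
 */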
2459  Func &store_in(MemoryType memory_type);
2460 
2461  /** Trace all loads from this Func by emitting calls to
2462  * halide_trace. If the Func is inlined, this has no
2463  * effect. */
2464  Func &trace_loads();
2465 
2466  /** Trace all stores to the buffer backing this Func by emitting
2467  * calls to halide_trace. If the Func is inlined, this call
2468  * has no effect. */
2469  Func &trace_stores();
2470 
2471  /** Trace all realizations of this Func by emitting calls to
2472  * halide_trace. */
2473  Func &trace_realizations();
2474 
2475  /** Add a string of arbitrary text that will be passed thru to trace
2476  * inspection code if the Func is realized in trace mode. (Funcs that are
2477  * inlined won't have their tags emitted.) Ignored entirely if
2478  * tracing is not enabled for the Func (or globally).
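 *
 * For example (a sketch; the tag text itself is arbitrary and is only
 * interpreted by whatever tool consumes the trace):
 \code
 f.trace_stores().add_trace_tag("my_custom_tag");
 \endcode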
2479  */
2480  Func &add_trace_tag(const std::string &trace_tag);
2481 
2482  /** Get a handle on the internal halide function that this Func
2483  * represents. Useful if you want to do introspection on Halide
2484  * functions */
2485  Internal::Function function() const {
2486  return func;
2487  }
2488 
2489  /** You can cast a Func to its pure stage for the purposes of
2490  * scheduling it. */
2491  operator Stage() const;
2492 
2493  /** Get a handle on the output buffer for this Func. Only relevant
2494  * if this is the output Func in a pipeline. Useful for making
2495  * static promises about strides, mins, and extents. */
2496  // @{
2497  OutputImageParam output_buffer() const;
2498  std::vector<OutputImageParam> output_buffers() const;
2499  // @}
2500 
2501  /** Use a Func as an argument to an external stage. */
2502  operator ExternFuncArgument() const;
2503 
2504  /** Infer the arguments to the Func, sorted into a canonical order:
2505  * all buffers (sorted alphabetically by name), followed by all non-buffers
2506  * (sorted alphabetically by name).
2507  This lets you write things like:
2508  \code
2509  func.compile_to_assembly("/dev/stdout", func.infer_arguments());
2510  \endcode
2511  */
2512  std::vector<Argument> infer_arguments() const;
2513 
2514  /** Get the source location of the pure definition of this
2515  * Func. See Stage::source_location() */
2516  std::string source_location() const;
2517 
2518  /** Return the current StageSchedule associated with this initial
2519  * Stage of this Func. For introspection only: to modify schedule,
2520  * use the Func interface. */
2521  const Internal::StageSchedule &get_schedule() const {
2522  return Stage(*this).get_schedule();
2523  }
2524 };
2525 
2526 namespace Internal {
2527 
2528 template<typename Last>
2529 inline void check_types(const Tuple &t, int idx) {
2530  using T = typename std::remove_pointer<typename std::remove_reference<Last>::type>::type;
2531  user_assert(t[idx].type() == type_of<T>())
2532  << "Can't evaluate expression "
2533  << t[idx] << " of type " << t[idx].type()
2534  << " as a scalar of type " << type_of<T>() << "\n";
2535 }
2536 
2537 template<typename First, typename Second, typename... Rest>
2538 inline void check_types(const Tuple &t, int idx) {
2539  check_types<First>(t, idx);
2540  check_types<Second, Rest...>(t, idx + 1);
2541 }
2542 
2543 template<typename Last>
2544 inline void assign_results(Realization &r, int idx, Last last) {
2545  using T = typename std::remove_pointer<typename std::remove_reference<Last>::type>::type;
2546  *last = Buffer<T>(r[idx])();
2547 }
2548 
2549 template<typename First, typename Second, typename... Rest>
2550 inline void assign_results(Realization &r, int idx, First first, Second second, Rest &&...rest) {
2551  assign_results<First>(r, idx, first);
2552  assign_results<Second, Rest...>(r, idx + 1, second, rest...);
2553 }
2554 
2555 } // namespace Internal
2556 
2557 /** JIT-Compile and run enough code to evaluate a Halide
2558  * expression. This can be thought of as a scalar version of
2559  * \ref Func::realize */
2560 template<typename T>
2561 HALIDE_NO_USER_CODE_INLINE T evaluate(JITUserContext *ctx, const Expr &e) {
2562  user_assert(e.type() == type_of<T>())
2563  << "Can't evaluate expression "
2564  << e << " of type " << e.type()
2565  << " as a scalar of type " << type_of<T>() << "\n";
2566  Func f;
2567  f() = e;
2568  Buffer<T, 0> im = f.realize(ctx);
2569  return im();
2570 }
2571 
2572 /** evaluate with a default user context */
2573 template<typename T>
2574 HALIDE_NO_USER_CODE_INLINE T evaluate(const Expr &e) {
2575  return evaluate<T>(nullptr, e);
2576 }
2577 
2578 /** JIT-compile and run enough code to evaluate a Halide Tuple. */
2579 template<typename First, typename... Rest>
2580 HALIDE_NO_USER_CODE_INLINE void evaluate(JITUserContext *ctx, Tuple t, First first, Rest &&...rest) {
2581  Internal::check_types<First, Rest...>(t, 0);
2582 
2583  Func f;
2584  f() = t;
2585  Realization r = f.realize(ctx);
2586  Internal::assign_results(r, 0, first, rest...);
2587 }
2588 
2589 /** JIT-compile and run enough code to evaluate a Halide Tuple.
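 *
 * For example (a sketch):
 \code
 int a;
 float b;
 evaluate(Tuple(Expr(3) + Expr(4), Expr(1.25f) * 2.0f), &a, &b);
 \endcode
 */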
2590 template<typename First, typename... Rest>
2591 HALIDE_NO_USER_CODE_INLINE void evaluate(Tuple t, First first, Rest &&...rest) {
2592  evaluate<First, Rest...>(nullptr, std::move(t), std::forward<First>(first), std::forward<Rest>(rest)...);
2593 }
2594 
2595 namespace Internal {
2596 
2597 inline void schedule_scalar(Func f) {
2598  Target t = get_jit_target_from_environment();
2599  if (t.has_gpu_feature()) {
2600  f.gpu_single_thread();
2601  }
2602  if (t.has_feature(Target::HVX)) {
2603  f.hexagon();
2604  }
2605 }
2606 
2607 } // namespace Internal
2608 
2609 /** JIT-Compile and run enough code to evaluate a Halide
2610  * expression. This can be thought of as a scalar version of
2611  * \ref Func::realize. Can use GPU if jit target from environment
2612  * specifies one.
2613  */
2614 template<typename T>
2615 HALIDE_NO_USER_CODE_INLINE T evaluate_may_gpu(const Expr &e) {
2616  user_assert(e.type() == type_of<T>())
2617  << "Can't evaluate expression "
2618  << e << " of type " << e.type()
2619  << " as a scalar of type " << type_of<T>() << "\n";
2620  Func f;
2621  f() = e;
2622  Internal::schedule_scalar(f);
2623  Buffer<T, 0> im = f.realize();
2624  return im();
2625 }
2626 
2627 /** JIT-compile and run enough code to evaluate a Halide Tuple. Can
2628  * use GPU if jit target from environment specifies one. */
2629 // @{
2630 template<typename First, typename... Rest>
2631 HALIDE_NO_USER_CODE_INLINE void evaluate_may_gpu(Tuple t, First first, Rest &&...rest) {
2632  Internal::check_types<First, Rest...>(t, 0);
2633 
2634  Func f;
2635  f() = t;
2636  Internal::schedule_scalar(f);
2637  Realization r = f.realize();
2638  Internal::assign_results(r, 0, first, rest...);
2639 }
2640 // @}
2641 
2642 } // namespace Halide
2643 
2644 #endif
Defines a type used for expressing the type signature of a generated halide pipeline.
#define internal_assert(c)
Definition: Errors.h:19
#define user_assert(c)
Definition: Errors.h:15
Base classes for Halide expressions (Halide::Expr) and statements (Halide::Internal::Stmt)
Defines the struct representing lifetime and dependencies of a JIT compiled halide pipeline.
Defines Module, an IR container that fully describes a Halide program.
Classes for declaring scalar parameters to halide pipelines.
Defines the front-end class representing an entire Halide imaging pipeline.
Defines the front-end syntax for reduction domains and reduction variables.
Defines the structure that describes a Halide target.
Defines Tuple - the front-end handle on small arrays of expressions.
#define HALIDE_NO_USER_CODE_INLINE
Definition: Util.h:45
Defines the Var - the front-end variable.
A Halide::Buffer is a named shared reference to a Halide::Runtime::Buffer.
Definition: Buffer.h:120
Helper class for identifying purpose of an Expr passed to memoize.
Definition: Func.h:688
EvictionKey(const Expr &expr=Expr())
Definition: Func.h:694
A halide function.
Definition: Func.h:703
void print_loop_nest()
Write out the loop nests specified by the schedule for this Function.
Func & unroll(const VarOrRVar &var)
Mark a dimension to be completely unrolled.
bool is_extern() const
Is this function an external stage? That is, was it defined using define_extern?
FuncRef operator()(std::vector< Expr >) const
Either calls to the function, or the left-hand-side of an update definition (see RDom).
Func & hexagon(const VarOrRVar &x=Var::outermost())
Schedule for execution on Hexagon.
Func(const std::string &name)
Declare a new undefined function with the given name.
void compile_to_multitarget_object_files(const std::string &filename_prefix, const std::vector< Argument > &args, const std::vector< Target > &targets, const std::vector< std::string > &suffixes)
Like compile_to_multitarget_static_library(), except that the object files are all output as object f...
Func & align_extent(const Var &var, Expr modulus)
Expand the region computed so that the extent is a multiple of 'modulus'.
HALIDE_NO_USER_CODE_INLINE std::enable_if< Internal::all_are_convertible< Var, Args... >::value, FuncRef >::type operator()(Args &&...args) const
Definition: Func.h:1258
Func & tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &xo, const VarOrRVar &yo, const VarOrRVar &xi, const VarOrRVar &yi, const Expr &xfactor, const Expr &yfactor, TailStrategy tail=TailStrategy::Auto)
Split two dimensions at once by the given factors, and then reorder the resulting dimensions to be xi...
void specialize_fail(const std::string &message)
Add a specialization to a Func that always terminates execution with a call to halide_error().
Func & memoize(const EvictionKey &eviction_key=EvictionKey())
Use the halide_memoization_cache_...
void compile_to_assembly(const std::string &filename, const std::vector< Argument > &, const std::string &fn_name, const Target &target=get_target_from_environment())
Statically compile this function to text assembly equivalent to the object file generated by compile_...
Func & gpu_tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &z, const VarOrRVar &tx, const VarOrRVar &ty, const VarOrRVar &tz, const Expr &x_size, const Expr &y_size, const Expr &z_size, TailStrategy tail=TailStrategy::Auto, DeviceAPI device_api=DeviceAPI::Default_GPU)
Func & allow_race_conditions()
Specify that race conditions are permitted for this Func, which enables parallelizing over RVars even...
bool has_update_definition() const
Does this function have at least one update definition?
void compile_jit(const Target &target=get_jit_target_from_environment())
Eagerly jit compile the function to machine code.
Func & bound_storage(const Var &dim, const Expr &bound)
Bound the extent of a Func's storage, but not extent of its compute.
Func()
Declare a new undefined function with an automatically-generated unique name.
Func & async()
Produce this Func asynchronously in a separate thread.
void compile_to_bitcode(const std::string &filename, const std::vector< Argument > &, const Target &target=get_target_from_environment())
void realize(Pipeline::RealizationArg outputs, const Target &target=Target(), const ParamMap &param_map=ParamMap::empty_map())
Evaluate this function into an existing allocated buffer or buffers.
void set_custom_trace(int(*trace_fn)(void *, const halide_trace_event_t *))
Func & gpu(const VarOrRVar &block_x, const VarOrRVar &block_y, const VarOrRVar &block_z, const VarOrRVar &thread_x, const VarOrRVar &thread_y, const VarOrRVar &thread_z, DeviceAPI device_api=DeviceAPI::Default_GPU)
Func & gpu_threads(const VarOrRVar &thread_x, const VarOrRVar &thread_y, const VarOrRVar &thread_z, DeviceAPI device_api=DeviceAPI::Default_GPU)
Func & compute_root()
Compute all of this function once ahead of time.
Func & tile(const std::vector< VarOrRVar > &previous, const std::vector< VarOrRVar > &inners, const std::vector< Expr > &factors, TailStrategy tail=TailStrategy::Auto)
Generalized tiling, reusing the previous names as the outer names.
Func & gpu(const VarOrRVar &block_x, const VarOrRVar &thread_x, DeviceAPI device_api=DeviceAPI::Default_GPU)
Tell Halide that the following dimensions correspond to GPU block indices and thread indices.
Func & compute_with(const Stage &s, const VarOrRVar &var, const std::vector< std::pair< VarOrRVar, LoopAlignStrategy >> &align)
Schedule the iteration over the initial definition of this function to be fused with another stage 's...
void compile_to_lowered_stmt(const std::string &filename, const std::vector< Argument > &args, StmtOutputFormat fmt=Text, const Target &target=get_target_from_environment())
Write out an internal representation of lowered code.
void compile_to_c(const std::string &filename, const std::vector< Argument > &, const std::string &fn_name="", const Target &target=get_target_from_environment())
Statically compile this function to C source code.
Func & fuse(const VarOrRVar &inner, const VarOrRVar &outer, const VarOrRVar &fused)
Join two dimensions into a single fused dimenion.
Func & fold_storage(const Var &dim, const Expr &extent, bool fold_forward=true)
Store realizations of this function in a circular buffer of a given extent.
Func & store_at(LoopLevel loop_level)
Equivalent to the version of store_at that takes a Var, but schedules storage at a given LoopLevel.
Stage update(int idx=0)
Get a handle on an update step for the purposes of scheduling it.
Func & reorder_storage(const Var &x, const Var &y)
HALIDE_NO_USER_CODE_INLINE std::enable_if< Internal::all_are_convertible< Expr, Args... >::value, FuncRef >::type operator()(const Expr &x, Args &&...args) const
Definition: Func.h:1275
bool defined() const
Does this function have at least a pure definition.
Func & compute_at(LoopLevel loop_level)
Schedule a function to be computed within the iteration over a given LoopLevel.
const Internal::StageSchedule & get_schedule() const
Return the current StageSchedule associated with this initial Stage of this Func.
Definition: Func.h:2521
Func & gpu_blocks(const VarOrRVar &block_x, DeviceAPI device_api=DeviceAPI::Default_GPU)
Tell Halide that the following dimensions correspond to GPU block indices.
Func & store_at(const Func &f, const Var &var)
Allocate storage for this function within f's loop over var.
Func copy_to_host()
Declare that this function should be implemented by a call to halide_buffer_copy with a NULL target d...
Func & split(const VarOrRVar &old, const VarOrRVar &outer, const VarOrRVar &inner, const Expr &factor, TailStrategy tail=TailStrategy::Auto)
Split a dimension into inner and outer subdimensions with the given names, where the inner dimension ...
Func & prefetch(const Func &f, const VarOrRVar &var, int offset=1, PrefetchBoundStrategy strategy=PrefetchBoundStrategy::GuardWithIf)
Prefetch data written to or read from a Func or an ImageParam by a subsequent loop iteration,...
Definition: Func.h:1975
Func & compute_with(LoopLevel loop_level, const std::vector< std::pair< VarOrRVar, LoopAlignStrategy >> &align)
std::vector< Argument > infer_arguments() const
Infer the arguments to the Func, sorted into a canonical order: all buffers (sorted alphabetically by...
void compile_to_header(const std::string &filename, const std::vector< Argument > &, const std::string &fn_name="", const Target &target=get_target_from_environment())
Emit a header file with the given filename for this function.
Func & prefetch(const T &image, const VarOrRVar &at, const VarOrRVar &from, Expr offset=1, PrefetchBoundStrategy strategy=PrefetchBoundStrategy::GuardWithIf)
Definition: Func.h:2074
std::vector< Var > args() const
Get the pure arguments.
Func(const Expr &e)
Declare a new function with an automatically-generated unique name, and define it to return the given...
Func & add_trace_tag(const std::string &trace_tag)
Add a string of arbitrary text that will be passed thru to trace inspection code if the Func is reali...
int dimensions() const
The dimensionality (number of arguments) of this function.
void realize(JITUserContext *context, Pipeline::RealizationArg outputs, const Target &target=Target(), const ParamMap &param_map=ParamMap::empty_map())
Same as above, but takes a custom user-provided context to be passed to runtime functions.
HALIDE_NO_USER_CODE_INLINE std::enable_if< Internal::all_are_convertible< Var, Args... >::value, Func & >::type reorder_storage(const Var &x, const Var &y, Args &&...args)
Definition: Func.h:2100
void set_custom_do_par_for(int(*custom_do_par_for)(void *, int(*)(void *, int, uint8_t *), int, int, uint8_t *))
Func & align_bounds(const Var &var, Expr modulus, Expr remainder=0)
Expand the region computed so that the min coordinate is congruent to 'remainder' modulo 'modulus',...
std::string source_location() const
Get the source location of the pure definition of this Func.
Func & compute_with(LoopLevel loop_level, LoopAlignStrategy align=LoopAlignStrategy::Auto)
HALIDE_NO_USER_CODE_INLINE std::enable_if< Internal::all_are_convertible< VarOrRVar, Args... >::value, Func & >::type reorder(const VarOrRVar &x, const VarOrRVar &y, Args &&...args)
Definition: Func.h:1579
void infer_input_bounds(const std::vector< int32_t > &sizes, const Target &target=get_jit_target_from_environment(), const ParamMap &param_map=ParamMap::empty_map())
For a given size of output, or a given output buffer, determine the bounds required of all unbound Im...
Func & store_root()
Equivalent to Func::store_at, but schedules storage outside the outermost loop.
int outputs() const
Get the number of outputs of this Func.
void set_custom_allocator(void *(*malloc)(void *, size_t), void(*free)(void *, void *))
Tuple update_values(int idx=0) const
Get the right-hand-side of an update definition for functions that return multiple values.
void compile_to_bitcode(const std::string &filename, const std::vector< Argument > &, const std::string &fn_name, const Target &target=get_target_from_environment())
Statically compile this function to llvm bitcode, with the given filename (which should probably end ...
int num_update_definitions() const
How many update definitions does this function have?
Func & rename(const VarOrRVar &old_name, const VarOrRVar &new_name)
Rename a dimension.
Func & vectorize(const VarOrRVar &var)
Mark a dimension to be computed all-at-once as a single vector.
Func & tile(const std::vector< VarOrRVar > &previous, const std::vector< VarOrRVar > &outers, const std::vector< VarOrRVar > &inners, const std::vector< Expr > &factors, const std::vector< TailStrategy > &tails)
A more general form of tile, which defines tiles of any dimensionality.
Func & prefetch(const Internal::Parameter &param, const VarOrRVar &at, const VarOrRVar &from, Expr offset=1, PrefetchBoundStrategy strategy=PrefetchBoundStrategy::GuardWithIf)
Func & bound_extent(const Var &var, Expr extent)
Bound the extent of a Func's realization, but not its min.
Func & trace_stores()
Trace all stores to the buffer backing this Func by emitting calls to halide_trace.
Func & set_estimates(const Region &estimates)
Set (min, extent) estimates for all dimensions in the Func at once; this is equivalent to calling set...
Stage specialize(const Expr &condition)
Specialize a Func.
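A hedged sketch (the Param 'width' is an illustrative runtime parameter):
    Param<int> width;
    Func f; Var x;
    f(x) = x * 2;
    // Emit a vectorized code path taken only when the condition holds at runtime.
    f.specialize(width >= 8).vectorize(x, 8);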
Func & compute_at(const Func &f, const Var &var)
Compute this function as needed for each unique value of the given var for the given calling function...
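A minimal producer/consumer sketch (names are illustrative):
    Func blur_x, blur_y; Var x, y;
    blur_x(x, y) = x + y;
    blur_y(x, y) = blur_x(x, y - 1) + blur_x(x, y) + blur_x(x, y + 1);
    // Compute the needed window of blur_x inside each iteration of blur_y's y loop.
    blur_x.compute_at(blur_y, y);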
void set_custom_do_task(int(*custom_do_task)(void *, int(*)(void *, int, uint8_t *), int, uint8_t *))
Func & tile(const std::vector< VarOrRVar > &previous, const std::vector< VarOrRVar > &outers, const std::vector< VarOrRVar > &inners, const std::vector< Expr > &factors, TailStrategy tail=TailStrategy::Auto)
The generalized tile, with a single tail strategy to apply to all vars.
Func & reorder_storage(const std::vector< Var > &dims)
Specify how the storage for the function is laid out.
Func & compute_at(const Func &f, const RVar &var)
Schedule a function to be computed within the iteration over some dimension of an update domain.
Func & gpu_tile(const VarOrRVar &x, const VarOrRVar &bx, const VarOrRVar &tx, const Expr &x_size, TailStrategy tail=TailStrategy::Auto, DeviceAPI device_api=DeviceAPI::Default_GPU)
Short-hand for tiling a domain and mapping the tile indices to GPU block indices and the coordinates ...
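A minimal sketch of the one-dimensional overload (f, x, bx, tx are illustrative names):
    Func f; Var x, bx, tx;
    f(x) = x;
    // 256-wide tiles: bx becomes the GPU block index, tx the thread index within a block.
    f.gpu_tile(x, bx, tx, 256);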
const std::vector< Expr > & update_args(int idx=0) const
Get the left-hand-side of the update definition.
Func & gpu_blocks(const VarOrRVar &block_x, const VarOrRVar &block_y, DeviceAPI device_api=DeviceAPI::Default_GPU)
Func & store_at(const Func &f, const RVar &var)
Equivalent to the version of store_at that takes a Var, but schedules storage within the loop over a ...
Realization realize(std::vector< int32_t > sizes={}, const Target &target=Target(), const ParamMap &param_map=ParamMap::empty_map())
Evaluate this function over some rectangular domain and return the resulting buffer or buffers.
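A minimal JIT sketch (the output sizes are illustrative):
    Func f; Var x, y;
    f(x, y) = x + y;
    Buffer<int32_t> out = f.realize({800, 600});  // evaluate f over [0, 800) x [0, 600)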
HALIDE_NO_USER_CODE_INLINE Func(Buffer< T, Dims > &im)
Construct a new Func to wrap a Buffer.
Definition: Func.h:749
void define_extern(const std::string &function_name, const std::vector< ExternFuncArgument > &params, const std::vector< Type > &types, const std::vector< Var > &arguments, NameMangling mangling=NameMangling::Default, DeviceAPI device_api=DeviceAPI::Host)
Func & parallel(const VarOrRVar &var, const Expr &task_size, TailStrategy tail=TailStrategy::Auto)
Split a dimension by the given task_size, and then parallelize the outer dimension.
JITHandlers & jit_handlers()
Get a struct containing the currently set custom functions used by JIT.
Expr value() const
The right-hand-side value of the pure definition of this function.
Func & tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &xi, const VarOrRVar &yi, const Expr &xfactor, const Expr &yfactor, TailStrategy tail=TailStrategy::Auto)
A shorter form of tile, which reuses the old variable names as the new outer dimensions.
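A minimal sketch (f, x, y, xi, yi are illustrative names):
    Func f; Var x, y, xi, yi;
    f(x, y) = x * y;
    // x and y become the tile indices; xi and yi iterate within each 8x8 tile.
    f.tile(x, y, xi, yi, 8, 8);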
void set_error_handler(void(*handler)(void *, const char *))
Deprecated variants of the above that use a void pointer instead of a JITUserContext pointer.
Func & gpu_tile(const VarOrRVar &x, const VarOrRVar &tx, const Expr &x_size, TailStrategy tail=TailStrategy::Auto, DeviceAPI device_api=DeviceAPI::Default_GPU)
Func clone_in(const std::vector< Func > &fs)
Module compile_to_module(const std::vector< Argument > &args, const std::string &fn_name="", const Target &target=get_target_from_environment())
Store an internal representation of lowered code as a self contained Module suitable for further comp...
void infer_input_bounds(JITUserContext *context, const std::vector< int32_t > &sizes, const Target &target=get_jit_target_from_environment(), const ParamMap &param_map=ParamMap::empty_map())
Versions of infer_input_bounds that take a custom user context to pass to runtime functions.
void define_extern(const std::string &function_name, const std::vector< ExternFuncArgument > &params, const std::vector< Type > &types, int dimensionality, NameMangling mangling=NameMangling::Default, DeviceAPI device_api=DeviceAPI::Host)
Definition: Func.h:1205
void set_custom_print(void(*handler)(void *, const char *))
Func in()
Create and return a global identity wrapper, which wraps all calls to this Func by any other Func.
Func & vectorize(const VarOrRVar &var, const Expr &factor, TailStrategy tail=TailStrategy::Auto)
Split a dimension by the given factor, then vectorize the inner dimension.
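A minimal sketch (f, x are illustrative names):
    Func f; Var x;
    f(x) = x * 2;
    f.vectorize(x, 8);  // process 8 elements per iteration as a single vector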
OutputImageParam output_buffer() const
Get a handle on the output buffer for this Func.
Expr update_value(int idx=0) const
Get the right-hand-side of an update definition.
Func & bound(const Var &var, Expr min, Expr extent)
Statically declare that the range over which a function should be evaluated is given by the second an...
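A minimal sketch contrasting bound with set_estimate (f, x, y are illustrative names):
    Func f; Var x, y;
    f(x, y) = x + y;
    f.bound(x, 0, 1024);        // hard guarantee: f is only ever evaluated for x in [0, 1024)
    f.set_estimate(y, 0, 768);  // a hint for auto-schedulers, not a hard constraint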
void compile_to(const std::map< OutputFileType, std::string > &output_files, const std::vector< Argument > &args, const std::string &fn_name, const Target &target=get_target_from_environment())
Compile and generate multiple target files with single call.
void compile_to_llvm_assembly(const std::string &filename, const std::vector< Argument > &, const Target &target=get_target_from_environment())
Func & gpu_threads(const VarOrRVar &thread_x, const VarOrRVar &thread_y, DeviceAPI device_api=DeviceAPI::Default_GPU)
void add_custom_lowering_pass(T *pass)
Add a custom pass to be used during lowering.
Definition: Func.h:1092
Func in(const std::vector< Func > &fs)
Create and return an identity wrapper shared by all the Funcs in 'fs'.
Func & gpu(const VarOrRVar &block_x, const VarOrRVar &block_y, const VarOrRVar &thread_x, const VarOrRVar &thread_y, DeviceAPI device_api=DeviceAPI::Default_GPU)
void infer_input_bounds(JITUserContext *context, Pipeline::RealizationArg outputs, const Target &target=get_jit_target_from_environment(), const ParamMap &param_map=ParamMap::empty_map())
Func & parallel(const VarOrRVar &var)
Mark a dimension to be traversed in parallel.
Func & serial(const VarOrRVar &var)
Mark a dimension to be traversed serially.
Func & prefetch(const Func &f, const VarOrRVar &at, const VarOrRVar &from, Expr offset=1, PrefetchBoundStrategy strategy=PrefetchBoundStrategy::GuardWithIf)
prefetch() is a more fine-grained version of prefetch(), which allows specification of different vars...
const std::string & name() const
The name of this function, either given during construction, or automatically generated.
void define_extern(const std::string &function_name, const std::vector< ExternFuncArgument > &params, Type t, int dimensionality, NameMangling mangling=NameMangling::Default, DeviceAPI device_api=DeviceAPI::Host)
Add an extern definition for this Func.
Definition: Func.h:1187
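A hedged sketch of the Type/dimensionality overload (the extern symbol "my_extern_stage" and its C implementation are hypothetical, not part of this header):
    Func ext;
    std::vector<ExternFuncArgument> params;  // no extern inputs in this sketch
    // Realizations of ext are produced by a C function following the extern-stage calling convention.
    ext.define_extern("my_extern_stage", params, Int(32), 2);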
Func & align_storage(const Var &dim, const Expr &alignment)
Pad the storage extent of a particular dimension of realizations of this function up to be a multiple...
void compile_to_file(const std::string &filename_prefix, const std::vector< Argument > &args, const std::string &fn_name="", const Target &target=get_target_from_environment())
Compile to object file and header pair, with the given arguments.
Func & gpu_threads(const VarOrRVar &thread_x, DeviceAPI device_api=DeviceAPI::Default_GPU)
Tell Halide that the following dimensions correspond to GPU thread indices.
void add_custom_lowering_pass(Internal::IRMutator *pass, std::function< void()> deleter)
Add a custom pass to be used during lowering, with the function that will be called to delete it also...
void clear_custom_lowering_passes()
Remove all previously-set custom lowering passes.
void compile_to_llvm_assembly(const std::string &filename, const std::vector< Argument > &, const std::string &fn_name, const Target &target=get_target_from_environment())
Statically compile this function to llvm assembly, with the given filename (which should probably end...
void compile_to_multitarget_static_library(const std::string &filename_prefix, const std::vector< Argument > &args, const std::vector< Target > &targets)
Compile to static-library file and header pair once for each target; each resulting function will be ...
Func & gpu_lanes(const VarOrRVar &thread_x, DeviceAPI device_api=DeviceAPI::Default_GPU)
The given dimension corresponds to the lanes in a GPU warp.
std::vector< OutputImageParam > output_buffers() const
Func & gpu_tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &z, const VarOrRVar &bx, const VarOrRVar &by, const VarOrRVar &bz, const VarOrRVar &tx, const VarOrRVar &ty, const VarOrRVar &tz, const Expr &x_size, const Expr &y_size, const Expr &z_size, TailStrategy tail=TailStrategy::Auto, DeviceAPI device_api=DeviceAPI::Default_GPU)
Func & store_in(MemoryType memory_type)
Set the type of memory this Func should be stored in.
Realization realize(JITUserContext *context, std::vector< int32_t > sizes={}, const Target &target=Target(), const ParamMap &param_map=ParamMap::empty_map())
Same as above, but takes a custom user-provided context to be passed to runtime functions.
void compile_to_assembly(const std::string &filename, const std::vector< Argument > &, const Target &target=get_target_from_environment())
Func clone_in(const Func &f)
Similar to Func::in; however, instead of replacing the call to this Func with an identity Func that r...
std::vector< RVar > rvars(int idx=0) const
Get the RVars of the reduction domain for an update definition, if there is one.
Func & gpu_single_thread(DeviceAPI device_api=DeviceAPI::Default_GPU)
Tell Halide to run this stage using a single gpu thread and block.
Func(Internal::Function f)
Construct a new Func to wrap an existing, already-defined Function object.
void compile_to_object(const std::string &filename, const std::vector< Argument > &, const std::string &fn_name, const Target &target=get_target_from_environment())
Statically compile this function to an object file, with the given filename (which should probably en...
const std::string & extern_function_name() const
Get the name of the extern function called for an extern definition.
Func & compute_with(const Stage &s, const VarOrRVar &var, LoopAlignStrategy align=LoopAlignStrategy::Auto)
Func & trace_realizations()
Trace all realizations of this Func by emitting calls to halide_trace.
Tuple values() const
The values returned by this function.
Func & gpu_tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &bx, const VarOrRVar &by, const VarOrRVar &tx, const VarOrRVar &ty, const Expr &x_size, const Expr &y_size, TailStrategy tail=TailStrategy::Auto, DeviceAPI device_api=DeviceAPI::Default_GPU)
Func & compute_inline()
Aggressively inline all uses of this function.
const std::vector< Type > & output_types() const
Get the types of the outputs of this Func.
Func copy_to_device(DeviceAPI d=DeviceAPI::Default_GPU)
Declare that this function should be implemented by a call to halide_buffer_copy with the given targe...
Func & gpu_tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &tx, const VarOrRVar &ty, const Expr &x_size, const Expr &y_size, TailStrategy tail=TailStrategy::Auto, DeviceAPI device_api=DeviceAPI::Default_GPU)
void compile_to_object(const std::string &filename, const std::vector< Argument > &, const Target &target=get_target_from_environment())
void define_extern(const std::string &function_name, const std::vector< ExternFuncArgument > &params, Type t, const std::vector< Var > &arguments, NameMangling mangling=NameMangling::Default, DeviceAPI device_api=DeviceAPI::Host)
Definition: Func.h:1215
Func & reorder(const std::vector< VarOrRVar > &vars)
Reorder variables to have the given nesting order, from innermost out.
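A minimal sketch (f, x, y, c are illustrative names):
    Func f; Var x, y, c;
    f(x, y, c) = x + y + c;
    f.reorder(c, x, y);  // c becomes the innermost loop, y the outermost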
Func & atomic(bool override_associativity_test=false)
Issue atomic updates for this Func.
const std::vector< CustomLoweringPass > & custom_lowering_passes()
Get the custom lowering passes.
void debug_to_file(const std::string &filename)
When this function is compiled, include code that dumps its values to a file after it is realized,...
Func & gpu_blocks(const VarOrRVar &block_x, const VarOrRVar &block_y, const VarOrRVar &block_z, DeviceAPI device_api=DeviceAPI::Default_GPU)
Func in(const Func &f)
Creates and returns a new identity Func that wraps this Func.
void compile_to_static_library(const std::string &filename_prefix, const std::vector< Argument > &args, const std::string &fn_name="", const Target &target=get_target_from_environment())
Compile to static-library file and header pair, with the given arguments.
Func & set_estimate(const Var &var, const Expr &min, const Expr &extent)
Statically declare the range over which the function will be evaluated in the general case.
Func & unroll(const VarOrRVar &var, const Expr &factor, TailStrategy tail=TailStrategy::Auto)
Split a dimension by the given factor, then unroll the inner dimension.
void infer_input_bounds(Pipeline::RealizationArg outputs, const Target &target=get_jit_target_from_environment(), const ParamMap &param_map=ParamMap::empty_map())
Func & trace_loads()
Trace all loads from this Func by emitting calls to halide_trace.
FuncRef operator()(std::vector< Var >) const
Construct either the left-hand-side of a definition, or a call to a function that happens to only co...
void define_extern(const std::string &function_name, const std::vector< ExternFuncArgument > &params, const std::vector< Type > &types, int dimensionality, NameMangling mangling)
Definition: Func.h:1197
A fragment of front-end syntax of the form f(x, y, z), where x, y, z are Vars or Exprs.
Definition: Func.h:494
Stage operator*=(const FuncRef &)
FuncTupleElementRef operator[](int) const
When a FuncRef refers to a function that provides multiple outputs, you can access each output as an ...
Stage operator-=(const FuncRef &)
size_t size() const
How many outputs does the function this refers to produce?
Internal::Function function() const
What function is this calling?
Definition: Func.h:591
Stage operator+=(Expr)
Define a stage that adds the given expression to this Func.
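A minimal sketch of an update definition built with operator+= (in, total, r are illustrative names):
    Func in, total; Var x;
    in(x) = x;
    total() = 0;
    RDom r(0, 10);
    total() += in(r.x);  // update definition: sum of in over [0, 10)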
Stage operator-=(Expr)
Define a stage that adds the negative of the given expression to this Func.
Stage operator*=(Expr)
Define a stage that multiplies this Func by the given expression.
Stage operator-=(const Tuple &)
Stage operator/=(Expr)
Define a stage that divides this Func by the given expression.
Stage operator+=(const FuncRef &)
Stage operator=(const Expr &)
Use this as the left-hand-side of a definition or an update definition (see RDom).
Stage operator=(const FuncRef &)
FuncRef(Internal::Function, const std::vector< Var > &, int placeholder_pos=-1, int count=0)
Stage operator+=(const Tuple &)
FuncRef(const Internal::Function &, const std::vector< Expr > &, int placeholder_pos=-1, int count=0)
Stage operator/=(const FuncRef &)
Stage operator*=(const Tuple &)
Stage operator/=(const Tuple &)
Stage operator=(const Tuple &)
Use this as the left-hand-side of a definition or an update definition for a Func with multiple outpu...
A fragment of front-end syntax of the form f(x, y, z)[index], where x, y, z are Vars or Exprs.
Definition: Func.h:613
int index() const
Return index to the function outputs.
Definition: Func.h:677
Stage operator+=(const Expr &e)
Define a stage that adds the given expression to Tuple component 'idx' of this Func.
Stage operator*=(const Expr &e)
Define a stage that multiplies Tuple component 'idx' of this Func by the given expression.
Stage operator/=(const Expr &e)
Define a stage that divides Tuple component 'idx' of this Func by the given expression.
Stage operator=(const Expr &e)
Use this as the left-hand-side of an update definition of Tuple component 'idx' of a Func (see RDom).
Stage operator=(const FuncRef &e)
Stage operator-=(const Expr &e)
Define a stage that adds the negative of the given expression to Tuple component 'idx' of this Func.
FuncTupleElementRef(const FuncRef &ref, const std::vector< Expr > &args, int idx)
An Image parameter to a halide pipeline.
Definition: ImageParam.h:23
A Function definition which can represent either an init or an update definition.
Definition: Definition.h:38
const StageSchedule & schedule() const
Get the default (no-specialization) stage-specific schedule associated with this definition.
const std::vector< Expr > & args() const
Get the default (no-specialization) arguments (left-hand-side) of the definition.
bool defined() const
Definition objects are nullable.
A reference-counted handle to Halide's internal representation of a function.
Definition: Function.h:38
A base class for passes over the IR which modify it (e.g.
Definition: IRMutator.h:26
A reference-counted handle to a parameter to a halide pipeline.
Definition: Parameter.h:28
A schedule for a single stage of a Halide pipeline.
Definition: Schedule.h:646
A reference to a site in a Halide statement at the top of the body of a particular for loop.
Definition: Schedule.h:176
A halide module.
Definition: Module.h:172
A handle on the output buffer of a pipeline.
static const ParamMap & empty_map()
A const ref to an empty ParamMap.
Definition: ParamMap.h:104
A class representing a Halide pipeline.
Definition: Pipeline.h:99
A multi-dimensional domain over which to iterate.
Definition: RDom.h:193
A reduction variable represents a single dimension of a reduction domain (RDom).
Definition: RDom.h:29
const std::string & name() const
The name of this reduction variable.
A Realization is a vector of references to existing Buffer objects.
Definition: Realization.h:19
A single definition of a Func.
Definition: Func.h:70
Stage & prefetch(const T &image, const VarOrRVar &at, const VarOrRVar &from, Expr offset=1, PrefetchBoundStrategy strategy=PrefetchBoundStrategy::GuardWithIf)
Definition: Func.h:465
std::string name() const
Return the name of this stage, e.g.
Stage & rename(const VarOrRVar &old_name, const VarOrRVar &new_name)
Stage & gpu_threads(const VarOrRVar &thread_x, const VarOrRVar &thread_y, DeviceAPI device_api=DeviceAPI::Default_GPU)
Stage & gpu(const VarOrRVar &block_x, const VarOrRVar &block_y, const VarOrRVar &block_z, const VarOrRVar &thread_x, const VarOrRVar &thread_y, const VarOrRVar &thread_z, DeviceAPI device_api=DeviceAPI::Default_GPU)
Stage & gpu(const VarOrRVar &block_x, const VarOrRVar &block_y, const VarOrRVar &thread_x, const VarOrRVar &thread_y, DeviceAPI device_api=DeviceAPI::Default_GPU)
Stage & prefetch(const Func &f, const VarOrRVar &at, const VarOrRVar &from, Expr offset=1, PrefetchBoundStrategy strategy=PrefetchBoundStrategy::GuardWithIf)
Stage & tile(const std::vector< VarOrRVar > &previous, const std::vector< VarOrRVar > &outers, const std::vector< VarOrRVar > &inners, const std::vector< Expr > &factors, TailStrategy tail=TailStrategy::Auto)
Stage & gpu_tile(const VarOrRVar &x, const VarOrRVar &tx, const Expr &x_size, TailStrategy tail=TailStrategy::Auto, DeviceAPI device_api=DeviceAPI::Default_GPU)
HALIDE_NO_USER_CODE_INLINE std::enable_if< Internal::all_are_convertible< VarOrRVar, Args... >::value, Stage & >::type reorder(const VarOrRVar &x, const VarOrRVar &y, Args &&...args)
Definition: Func.h:378
Stage & gpu(const VarOrRVar &block_x, const VarOrRVar &thread_x, DeviceAPI device_api=DeviceAPI::Default_GPU)
Stage & gpu_tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &bx, const VarOrRVar &by, const VarOrRVar &tx, const VarOrRVar &ty, const Expr &x_size, const Expr &y_size, TailStrategy tail=TailStrategy::Auto, DeviceAPI device_api=DeviceAPI::Default_GPU)
Stage & hexagon(const VarOrRVar &x=Var::outermost())
Func rfactor(const RVar &r, const Var &v)
Stage & compute_with(const Stage &s, const VarOrRVar &var, LoopAlignStrategy align=LoopAlignStrategy::Auto)
Stage & vectorize(const VarOrRVar &var)
Stage & gpu_single_thread(DeviceAPI device_api=DeviceAPI::Default_GPU)
Stage & prefetch(const Func &f, const VarOrRVar &var, int offset=1, PrefetchBoundStrategy strategy=PrefetchBoundStrategy::GuardWithIf)
Definition: Func.h:445
Stage & compute_with(LoopLevel loop_level, LoopAlignStrategy align=LoopAlignStrategy::Auto)
Stage & prefetch(const Internal::Parameter &param, const VarOrRVar &at, const VarOrRVar &from, Expr offset=1, PrefetchBoundStrategy strategy=PrefetchBoundStrategy::GuardWithIf)
Stage & unroll(const VarOrRVar &var)
Stage & parallel(const VarOrRVar &var)
Stage & allow_race_conditions()
Stage & serial(const VarOrRVar &var)
Stage & gpu_tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &z, const VarOrRVar &bx, const VarOrRVar &by, const VarOrRVar &bz, const VarOrRVar &tx, const VarOrRVar &ty, const VarOrRVar &tz, const Expr &x_size, const Expr &y_size, const Expr &z_size, TailStrategy tail=TailStrategy::Auto, DeviceAPI device_api=DeviceAPI::Default_GPU)
Stage & gpu_tile(const VarOrRVar &x, const VarOrRVar &bx, const VarOrRVar &tx, const Expr &x_size, TailStrategy tail=TailStrategy::Auto, DeviceAPI device_api=DeviceAPI::Default_GPU)
Stage & tile(const std::vector< VarOrRVar > &previous, const std::vector< VarOrRVar > &outers, const std::vector< VarOrRVar > &inners, const std::vector< Expr > &factors, const std::vector< TailStrategy > &tails)
Stage specialize(const Expr &condition)
Stage & compute_with(LoopLevel loop_level, const std::vector< std::pair< VarOrRVar, LoopAlignStrategy >> &align)
Schedule the iteration over this stage to be fused with another stage 's' from outermost loop to a gi...
Stage & tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &xo, const VarOrRVar &yo, const VarOrRVar &xi, const VarOrRVar &yi, const Expr &xfactor, const Expr &yfactor, TailStrategy tail=TailStrategy::Auto)
Stage & split(const VarOrRVar &old, const VarOrRVar &outer, const VarOrRVar &inner, const Expr &factor, TailStrategy tail=TailStrategy::Auto)
Scheduling calls that control how the domain of this stage is traversed.
Stage & fuse(const VarOrRVar &inner, const VarOrRVar &outer, const VarOrRVar &fused)
Stage(Internal::Function f, Internal::Definition d, size_t stage_index)
Definition: Func.h:94
Stage & vectorize(const VarOrRVar &var, const Expr &factor, TailStrategy tail=TailStrategy::Auto)
Func rfactor(std::vector< std::pair< RVar, Var >> preserved)
Calling rfactor() on an associative update definition of a Func will split the update into an intermedia...
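A hedged sketch for a two-dimensional reduction (names are illustrative):
    Func f, g; Var x, y, u;
    RDom r(0, 10, 0, 10);
    f(x, y) = x + y;
    g() = 0;
    g() += f(r.x, r.y);
    // Factor the reduction over r.y into an intermediate with pure var u, then parallelize it.
    Func intm = g.update().rfactor(r.y, u);
    intm.compute_root().update().parallel(u);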
Stage & parallel(const VarOrRVar &var, const Expr &task_size, TailStrategy tail=TailStrategy::Auto)
Stage & gpu_blocks(const VarOrRVar &block_x, const VarOrRVar &block_y, const VarOrRVar &block_z, DeviceAPI device_api=DeviceAPI::Default_GPU)
const Internal::StageSchedule & get_schedule() const
Return the current StageSchedule associated with this Stage.
Definition: Func.h:107
Stage & gpu_tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &z, const VarOrRVar &tx, const VarOrRVar &ty, const VarOrRVar &tz, const Expr &x_size, const Expr &y_size, const Expr &z_size, TailStrategy tail=TailStrategy::Auto, DeviceAPI device_api=DeviceAPI::Default_GPU)
Stage & reorder(const std::vector< VarOrRVar > &vars)
Stage & gpu_blocks(const VarOrRVar &block_x, DeviceAPI device_api=DeviceAPI::Default_GPU)
Stage & gpu_blocks(const VarOrRVar &block_x, const VarOrRVar &block_y, DeviceAPI device_api=DeviceAPI::Default_GPU)
Stage & tile(const std::vector< VarOrRVar > &previous, const std::vector< VarOrRVar > &inners, const std::vector< Expr > &factors, TailStrategy tail=TailStrategy::Auto)
void specialize_fail(const std::string &message)
Stage & gpu_threads(const VarOrRVar &thread_x, const VarOrRVar &thread_y, const VarOrRVar &thread_z, DeviceAPI device_api=DeviceAPI::Default_GPU)
Stage & tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &xi, const VarOrRVar &yi, const Expr &xfactor, const Expr &yfactor, TailStrategy tail=TailStrategy::Auto)
Stage & compute_with(const Stage &s, const VarOrRVar &var, const std::vector< std::pair< VarOrRVar, LoopAlignStrategy >> &align)
Stage & unroll(const VarOrRVar &var, const Expr &factor, TailStrategy tail=TailStrategy::Auto)
Stage & atomic(bool override_associativity_test=false)
std::string source_location() const
Attempt to get the source file and line where this stage was defined by parsing the process's own deb...
Stage & gpu_threads(const VarOrRVar &thread_x, DeviceAPI device_api=DeviceAPI::Default_GPU)
Stage & gpu_lanes(const VarOrRVar &thread_x, DeviceAPI device_api=DeviceAPI::Default_GPU)
Stage & gpu_tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &tx, const VarOrRVar &ty, const Expr &x_size, const Expr &y_size, TailStrategy tail=TailStrategy::Auto, DeviceAPI device_api=DeviceAPI::Default_GPU)
std::string dump_argument_list() const
Return a string describing the current var list taking into account all the splits,...
void unscheduled()
Assert that this stage has intentionally been given no schedule, and suppress the warning about unsch...
Create a small array of Exprs for defining and calling functions with multiple outputs.
Definition: Tuple.h:18
A Halide variable, to be used when defining functions.
Definition: Var.h:19
const std::string & name() const
Get the name of a Var.
static Var outermost()
A Var that represents the location outside the outermost loop.
Definition: Var.h:163
void schedule_scalar(Func f)
Definition: Func.h:2597
void assign_results(Realization &r, int idx, Last last)
Definition: Func.h:2544
void check_types(const Tuple &t, int idx)
Definition: Func.h:2529
ForType
An enum describing a type of loop traversal.
Definition: Expr.h:399
std::vector< Var > make_argument_list(int dimensionality)
Make a list of unique arguments for definitions with unnamed arguments.
WEAK halide_do_task_t custom_do_task
WEAK halide_do_par_for_t custom_do_par_for
This file defines the class FunctionDAG, which is our representation of a Halide pipeline,...
@ Internal
Not visible externally, similar to 'static' linkage in C.
class HALIDE_ATTRIBUTE_DEPRECATED("Use OutputFileType instead of Output") Output
Definition: Module.h:46
PrefetchBoundStrategy
Different ways to handle accesses outside the original extents in a prefetch.
@ GuardWithIf
Guard the prefetch with if-guards that ignores the prefetch if any of the prefetched region ever goes...
HALIDE_NO_USER_CODE_INLINE T evaluate_may_gpu(const Expr &e)
JIT-Compile and run enough code to evaluate a Halide expression.
Definition: Func.h:2615
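A minimal sketch:
    // JIT-compile and run a single expression, using the GPU if the environment's target enables one.
    float s = evaluate_may_gpu<float>(sqrt(Expr(2.0f)));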
TailStrategy
Different ways to handle a tail case in a split when the factor does not provably divide the extent.
Definition: Schedule.h:32
@ Auto
For pure definitions use ShiftInwards.
LoopAlignStrategy
Different ways to handle the case when the start/end of the loops of stages computed with (fused) are...
Definition: Schedule.h:110
@ Auto
By default, LoopAlignStrategy is set to NoAlign.
Expr min(const FuncRef &a, const FuncRef &b)
Explicit overloads of min and max for FuncRef.
Definition: Func.h:600
NameMangling
An enum to specify calling convention for extern stages.
Definition: Function.h:24
@ Default
Match whatever is specified in the Target.
Target get_jit_target_from_environment()
Return the target that Halide will use for jit-compilation.
DeviceAPI
An enum describing a type of device API.
Definition: DeviceAPI.h:15
@ Host
Used to denote for loops that run on the same device as the containing code.
Target get_target_from_environment()
Return the target that Halide will use.
StmtOutputFormat
Used to determine if the output printed to file should be as a normal string or as an HTML file which...
Definition: Pipeline.h:63
@ Text
Definition: Pipeline.h:64
Stage ScheduleHandle
Definition: Func.h:485
std::vector< Range > Region
A multi-dimensional box.
Definition: Expr.h:343
Expr max(const FuncRef &a, const FuncRef &b)
Definition: Func.h:603
MemoryType
An enum describing different address spaces to be used with Func::store_in.
Definition: Expr.h:346
HALIDE_NO_USER_CODE_INLINE T evaluate(JITUserContext *ctx, const Expr &e)
JIT-Compile and run enough code to evaluate a Halide expression.
Definition: Func.h:2561
void * malloc(size_t)
unsigned __INT8_TYPE__ uint8_t
void free(void *)
A fragment of Halide syntax.
Definition: Expr.h:256
HALIDE_ALWAYS_INLINE Type type() const
Get the type of this expression node.
Definition: Expr.h:320
An argument to an extern-defined Func.
A set of custom overrides of runtime functions.
Definition: JITModule.h:33
A context to be passed to Pipeline::realize.
Definition: JITModule.h:134
A struct representing a target machine and os to generate code for.
Definition: Target.h:19
bool has_gpu_feature() const
Is a fully featured GPU compute runtime enabled? I.e.
bool has_feature(Feature f) const
Types in the halide type system.
Definition: Type.h:266
A class that can represent Vars or RVars.
Definition: Func.h:30
bool is_rvar
Definition: Func.h:58
VarOrRVar(const Var &v)
Definition: Func.h:34
VarOrRVar(const RVar &r)
Definition: Func.h:37
const std::string & name() const
Definition: Func.h:48
VarOrRVar(const std::string &n, bool r)
Definition: Func.h:31
VarOrRVar(const ImplicitVar< N > &u)
Definition: Func.h:44
VarOrRVar(const RDom &r)
Definition: Func.h:40