ViennaCL - The Vienna Computing Library  1.5.2
vector_operations.hpp
#ifndef VIENNACL_LINALG_OPENCL_VECTOR_OPERATIONS_HPP_
#define VIENNACL_LINALG_OPENCL_VECTOR_OPERATIONS_HPP_

/* =========================================================================
   Copyright (c) 2010-2014, Institute for Microelectronics,
                            Institute for Analysis and Scientific Computing,
                            TU Wien.
   Portions of this software are copyright by UChicago Argonne, LLC.

                            -----------------
                  ViennaCL - The Vienna Computing Library
                            -----------------

   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at

   (A list of authors and contributors can be found in the PDF manual)

   License:         MIT (X11), see file LICENSE in the base directory
============================================================================= */

/** @file viennacl/linalg/opencl/vector_operations.hpp
    @brief Implementations of vector operations using OpenCL
*/

#include <cmath>

#include "viennacl/forwards.h"
#include "viennacl/ocl/device.hpp"
#include "viennacl/ocl/handle.hpp"
#include "viennacl/ocl/kernel.hpp"
#include "viennacl/scalar.hpp"
#include "viennacl/tools/tools.hpp"
#include "viennacl/linalg/opencl/common.hpp"
#include "viennacl/linalg/opencl/kernels/vector.hpp"
#include "viennacl/linalg/opencl/kernels/vector_element.hpp"
#include "viennacl/meta/predicate.hpp"
#include "viennacl/meta/enable_if.hpp"
#include "viennacl/traits/size.hpp"
#include "viennacl/traits/start.hpp"
#include "viennacl/traits/handle.hpp"
#include "viennacl/traits/stride.hpp"

namespace viennacl
{
  namespace linalg
  {
    namespace opencl
    {
      //
      // Introductory note: By convention, all dimensions are already checked in the dispatcher frontend; there is no need to double-check them again here.
      //

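      // Usage sketch (illustration only, assuming the standard public vector API): library users
      // do not call the backend routines below directly; the expression template frontend
      // dispatches to them. For example,
      //   viennacl::vector<float> v1(100), v2(100), v3(100);
      //   v1  = 2.0f * v2;                    // routed to av()
      //   v1  = 2.0f * v2 + 3.0f * v3;        // routed to avbv()
      //   v1 += 2.0f * v2 + 3.0f * v3;        // routed to avbv_v()
      // The scalar modifiers (len_*, reciprocal_*, flip_sign_*) arrive as separate flags and are
      // packed into a single cl_uint bitfield via detail::make_options() before the kernel launch.
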
      template <typename T, typename ScalarType1>
      void av(vector_base<T> & vec1,
              vector_base<T> const & vec2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha)
      {
        assert(viennacl::traits::opencl_handle(vec1).context() == viennacl::traits::opencl_handle(vec2).context() && bool("Vectors do not reside in the same OpenCL context. Automatic migration not yet supported!"));

        viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(vec1).context());
        viennacl::linalg::opencl::kernels::vector<T>::init(ctx);

        cl_uint options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);

        viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector<T>::program_name(),
                                                   (viennacl::is_cpu_scalar<ScalarType1>::value ? "av_cpu" : "av_gpu"));
        k.global_work_size(0, std::min<vcl_size_t>(128 * k.local_work_size(),
                                                   viennacl::tools::align_to_multiple<vcl_size_t>(viennacl::traits::size(vec1), k.local_work_size()) ) );

        viennacl::ocl::packed_cl_uint size_vec1;
        size_vec1.start         = cl_uint(viennacl::traits::start(vec1));
        size_vec1.stride        = cl_uint(viennacl::traits::stride(vec1));
        size_vec1.size          = cl_uint(viennacl::traits::size(vec1));
        size_vec1.internal_size = cl_uint(viennacl::traits::internal_size(vec1));

        viennacl::ocl::packed_cl_uint size_vec2;
        size_vec2.start         = cl_uint(viennacl::traits::start(vec2));
        size_vec2.stride        = cl_uint(viennacl::traits::stride(vec2));
        size_vec2.size          = cl_uint(viennacl::traits::size(vec2));
        size_vec2.internal_size = cl_uint(viennacl::traits::internal_size(vec2));

        viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(vec1),
                                 size_vec1,

                                 viennacl::traits::opencl_handle(viennacl::tools::promote_if_host_scalar<T>(alpha)),
                                 options_alpha,
                                 viennacl::traits::opencl_handle(vec2),
                                 size_vec2 )
                              );
      }


      template <typename T, typename ScalarType1, typename ScalarType2>
      void avbv(vector_base<T> & vec1,
                vector_base<T> const & vec2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
                vector_base<T> const & vec3, ScalarType2 const & beta,  vcl_size_t len_beta,  bool reciprocal_beta,  bool flip_sign_beta)
      {
        assert(viennacl::traits::opencl_handle(vec1).context() == viennacl::traits::opencl_handle(vec2).context() && bool("Vectors do not reside in the same OpenCL context. Automatic migration not yet supported!"));
        assert(viennacl::traits::opencl_handle(vec2).context() == viennacl::traits::opencl_handle(vec3).context() && bool("Vectors do not reside in the same OpenCL context. Automatic migration not yet supported!"));

        viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(vec1).context());
        viennacl::linalg::opencl::kernels::vector<T>::init(ctx);

        std::string kernel_name;
        if (viennacl::is_cpu_scalar<ScalarType1>::value && viennacl::is_cpu_scalar<ScalarType2>::value)
          kernel_name = "avbv_cpu_cpu";
        else if (viennacl::is_cpu_scalar<ScalarType1>::value && !viennacl::is_cpu_scalar<ScalarType2>::value)
          kernel_name = "avbv_cpu_gpu";
        else if (!viennacl::is_cpu_scalar<ScalarType1>::value && viennacl::is_cpu_scalar<ScalarType2>::value)
          kernel_name = "avbv_gpu_cpu";
        else
          kernel_name = "avbv_gpu_gpu";

        cl_uint options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
        cl_uint options_beta  = detail::make_options(len_beta,  reciprocal_beta,  flip_sign_beta);

        viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector<T>::program_name(), kernel_name);
        k.global_work_size(0, std::min<vcl_size_t>(128 * k.local_work_size(),
                                                   viennacl::tools::align_to_multiple<vcl_size_t>(viennacl::traits::size(vec1), k.local_work_size()) ) );

        viennacl::ocl::packed_cl_uint size_vec1;
        size_vec1.start         = cl_uint(viennacl::traits::start(vec1));
        size_vec1.stride        = cl_uint(viennacl::traits::stride(vec1));
        size_vec1.size          = cl_uint(viennacl::traits::size(vec1));
        size_vec1.internal_size = cl_uint(viennacl::traits::internal_size(vec1));

        viennacl::ocl::packed_cl_uint size_vec2;
        size_vec2.start         = cl_uint(viennacl::traits::start(vec2));
        size_vec2.stride        = cl_uint(viennacl::traits::stride(vec2));
        size_vec2.size          = cl_uint(viennacl::traits::size(vec2));
        size_vec2.internal_size = cl_uint(viennacl::traits::internal_size(vec2));

        viennacl::ocl::packed_cl_uint size_vec3;
        size_vec3.start         = cl_uint(viennacl::traits::start(vec3));
        size_vec3.stride        = cl_uint(viennacl::traits::stride(vec3));
        size_vec3.size          = cl_uint(viennacl::traits::size(vec3));
        size_vec3.internal_size = cl_uint(viennacl::traits::internal_size(vec3));

        viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(vec1),
                                 size_vec1,

                                 viennacl::traits::opencl_handle(viennacl::tools::promote_if_host_scalar<T>(alpha)),
                                 options_alpha,
                                 viennacl::traits::opencl_handle(vec2),
                                 size_vec2,

                                 viennacl::traits::opencl_handle(viennacl::tools::promote_if_host_scalar<T>(beta)),
                                 options_beta,
                                 viennacl::traits::opencl_handle(vec3),
                                 size_vec3 )
                              );
      }


      template <typename T, typename ScalarType1, typename ScalarType2>
      void avbv_v(vector_base<T> & vec1,
                  vector_base<T> const & vec2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
                  vector_base<T> const & vec3, ScalarType2 const & beta,  vcl_size_t len_beta,  bool reciprocal_beta,  bool flip_sign_beta)
      {
        assert(viennacl::traits::opencl_handle(vec1).context() == viennacl::traits::opencl_handle(vec2).context() && bool("Vectors do not reside in the same OpenCL context. Automatic migration not yet supported!"));
        assert(viennacl::traits::opencl_handle(vec2).context() == viennacl::traits::opencl_handle(vec3).context() && bool("Vectors do not reside in the same OpenCL context. Automatic migration not yet supported!"));

        viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(vec1).context());
        viennacl::linalg::opencl::kernels::vector<T>::init(ctx);

        std::string kernel_name;
        if (viennacl::is_cpu_scalar<ScalarType1>::value && viennacl::is_cpu_scalar<ScalarType2>::value)
          kernel_name = "avbv_v_cpu_cpu";
        else if (viennacl::is_cpu_scalar<ScalarType1>::value && !viennacl::is_cpu_scalar<ScalarType2>::value)
          kernel_name = "avbv_v_cpu_gpu";
        else if (!viennacl::is_cpu_scalar<ScalarType1>::value && viennacl::is_cpu_scalar<ScalarType2>::value)
          kernel_name = "avbv_v_gpu_cpu";
        else
          kernel_name = "avbv_v_gpu_gpu";

        cl_uint options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
        cl_uint options_beta  = detail::make_options(len_beta,  reciprocal_beta,  flip_sign_beta);

        viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector<T>::program_name(), kernel_name);
        k.global_work_size(0, std::min<vcl_size_t>(128 * k.local_work_size(),
                                                   viennacl::tools::align_to_multiple<vcl_size_t>(viennacl::traits::size(vec1), k.local_work_size()) ) );

        viennacl::ocl::packed_cl_uint size_vec1;
        size_vec1.start         = cl_uint(viennacl::traits::start(vec1));
        size_vec1.stride        = cl_uint(viennacl::traits::stride(vec1));
        size_vec1.size          = cl_uint(viennacl::traits::size(vec1));
        size_vec1.internal_size = cl_uint(viennacl::traits::internal_size(vec1));

        viennacl::ocl::packed_cl_uint size_vec2;
        size_vec2.start         = cl_uint(viennacl::traits::start(vec2));
        size_vec2.stride        = cl_uint(viennacl::traits::stride(vec2));
        size_vec2.size          = cl_uint(viennacl::traits::size(vec2));
        size_vec2.internal_size = cl_uint(viennacl::traits::internal_size(vec2));

        viennacl::ocl::packed_cl_uint size_vec3;
        size_vec3.start         = cl_uint(viennacl::traits::start(vec3));
        size_vec3.stride        = cl_uint(viennacl::traits::stride(vec3));
        size_vec3.size          = cl_uint(viennacl::traits::size(vec3));
        size_vec3.internal_size = cl_uint(viennacl::traits::internal_size(vec3));

        viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(vec1),
                                 size_vec1,

                                 viennacl::traits::opencl_handle(viennacl::tools::promote_if_host_scalar<T>(alpha)),
                                 options_alpha,
                                 viennacl::traits::opencl_handle(vec2),
                                 size_vec2,

                                 viennacl::traits::opencl_handle(viennacl::tools::promote_if_host_scalar<T>(beta)),
                                 options_beta,
                                 viennacl::traits::opencl_handle(vec3),
                                 size_vec3 )
                              );
      }


      /** @brief Assign a constant value to a vector (-range/-slice)
      *
      * @param vec1                 The vector to which the value should be assigned
      * @param alpha                The value to be assigned
      * @param up_to_internal_size  Whether 'alpha' should also be written to the padded memory at the end of the buffer (e.g. when clearing the whole vector)
      */
      template <typename T>
      void vector_assign(vector_base<T> & vec1, const T & alpha, bool up_to_internal_size = false)
      {
        viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(vec1).context());
        viennacl::linalg::opencl::kernels::vector<T>::init(ctx);

        viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector<T>::program_name(), "assign_cpu");
        k.global_work_size(0, std::min<vcl_size_t>(128 * k.local_work_size(),
                                                   viennacl::tools::align_to_multiple<vcl_size_t>(viennacl::traits::size(vec1), k.local_work_size()) ) );

        cl_uint size = up_to_internal_size ? cl_uint(vec1.internal_size()) : cl_uint(viennacl::traits::size(vec1));
        viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(vec1),
                                 cl_uint(viennacl::traits::start(vec1)),
                                 cl_uint(viennacl::traits::stride(vec1)),
                                 size,
                                 cl_uint(vec1.internal_size()), // Note: Do NOT use traits::internal_size() here, because vector proxies don't require padding.
                                 viennacl::traits::opencl_handle(T(alpha)) )
                              );
      }

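      // Usage sketch (illustration only, assuming the public vector API): the frontend
      // typically reaches vector_assign() through calls such as
      //   viennacl::vector<float> v(100);
      //   v.clear();                                     // zeros the buffer, including padding
      //   v = viennacl::scalar_vector<float>(100, 1.0f); // writes 1.0f to the 100 logical entries
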

      /** @brief Swaps the contents of two vectors, data is copied
      *
      * @param vec1   The first vector (or -range, or -slice)
      * @param vec2   The second vector (or -range, or -slice)
      */
      template <typename T>
      void vector_swap(vector_base<T> & vec1, vector_base<T> & vec2)
      {
        assert(viennacl::traits::opencl_handle(vec1).context() == viennacl::traits::opencl_handle(vec2).context() && bool("Vectors do not reside in the same OpenCL context. Automatic migration not yet supported!"));

        viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(vec1).context());
        viennacl::linalg::opencl::kernels::vector<T>::init(ctx);

        viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector<T>::program_name(), "swap");

        viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(vec1),
                                 cl_uint(viennacl::traits::start(vec1)),
                                 cl_uint(viennacl::traits::stride(vec1)),
                                 cl_uint(viennacl::traits::size(vec1)),
                                 viennacl::traits::opencl_handle(vec2),
                                 cl_uint(viennacl::traits::start(vec2)),
                                 cl_uint(viennacl::traits::stride(vec2)),
                                 cl_uint(viennacl::traits::size(vec2)))
                              );
      }


      /** @brief Implementation of the element-wise operations v1 = v2 .* v3 and v1 = v2 ./ v3 (using MATLAB syntax)
      *
      * @param vec1   The result vector (or -range, or -slice)
      * @param proxy  The proxy object holding v2, v3 and the operation
      */
      template <typename T, typename OP>
      void element_op(vector_base<T> & vec1,
                      vector_expression<const vector_base<T>, const vector_base<T>, op_element_binary<OP> > const & proxy)
      {
        assert(viennacl::traits::opencl_handle(vec1).context() == viennacl::traits::opencl_handle(proxy.lhs()).context() && bool("Vectors do not reside in the same OpenCL context. Automatic migration not yet supported!"));
        assert(viennacl::traits::opencl_handle(vec1).context() == viennacl::traits::opencl_handle(proxy.rhs()).context() && bool("Vectors do not reside in the same OpenCL context. Automatic migration not yet supported!"));

        viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(vec1).context());
        viennacl::linalg::opencl::kernels::vector<T>::init(ctx);

        viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector<T>::program_name(), "element_op");

        cl_uint op_type = 2; //0: product, 1: division, 2: power
        if (viennacl::is_division<OP>::value)
          op_type = 1;
        else if (viennacl::is_product<OP>::value)
          op_type = 0;

        viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(vec1),
                                 cl_uint(viennacl::traits::start(vec1)),
                                 cl_uint(viennacl::traits::stride(vec1)),
                                 cl_uint(viennacl::traits::size(vec1)),

                                 viennacl::traits::opencl_handle(proxy.lhs()),
                                 cl_uint(viennacl::traits::start(proxy.lhs())),
                                 cl_uint(viennacl::traits::stride(proxy.lhs())),

                                 viennacl::traits::opencl_handle(proxy.rhs()),
                                 cl_uint(viennacl::traits::start(proxy.rhs())),
                                 cl_uint(viennacl::traits::stride(proxy.rhs())),

                                 op_type)
                              );
      }

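      // Usage sketch (illustration only, assuming the public element-wise API):
      //   viennacl::vector<float> v1(100), v2(100), v3(100);
      //   v1 = viennacl::linalg::element_prod(v2, v3);  // binary, op_type 0
      //   v1 = viennacl::linalg::element_div(v2, v3);   // binary, op_type 1
      //   v1 = viennacl::linalg::element_fabs(v2);      // unary, handled by the overload below
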

      /** @brief Implementation of unary element-wise operations v1 = OP(v2)
      *
      * @param vec1   The result vector (or -range, or -slice)
      * @param proxy  The proxy object holding v2 and the operation
      */
      template <typename T, typename OP>
      void element_op(vector_base<T> & vec1,
                      vector_expression<const vector_base<T>, const vector_base<T>, op_element_unary<OP> > const & proxy)
      {
        assert(viennacl::traits::opencl_handle(vec1).context() == viennacl::traits::opencl_handle(proxy.lhs()).context() && bool("Vectors do not reside in the same OpenCL context. Automatic migration not yet supported!"));
        assert(viennacl::traits::opencl_handle(vec1).context() == viennacl::traits::opencl_handle(proxy.rhs()).context() && bool("Vectors do not reside in the same OpenCL context. Automatic migration not yet supported!"));

        viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(vec1).context());
        viennacl::linalg::opencl::kernels::vector_element<T>::init(ctx);

        viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector_element<T>::program_name(), detail::op_to_string(OP()) + "_assign");

        viennacl::ocl::packed_cl_uint size_vec1;
        size_vec1.start         = cl_uint(viennacl::traits::start(vec1));
        size_vec1.stride        = cl_uint(viennacl::traits::stride(vec1));
        size_vec1.size          = cl_uint(viennacl::traits::size(vec1));
        size_vec1.internal_size = cl_uint(viennacl::traits::internal_size(vec1));

        viennacl::ocl::packed_cl_uint size_vec2;
        size_vec2.start         = cl_uint(viennacl::traits::start(proxy.lhs()));
        size_vec2.stride        = cl_uint(viennacl::traits::stride(proxy.lhs()));
        size_vec2.size          = cl_uint(viennacl::traits::size(proxy.lhs()));
        size_vec2.internal_size = cl_uint(viennacl::traits::internal_size(proxy.lhs()));

        viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(vec1),
                                 size_vec1,
                                 viennacl::traits::opencl_handle(proxy.lhs()),
                                 size_vec2)
                              );
      }


      /** @brief Computes the partial inner product of two vectors - implementation. Library users should call inner_prod(vec1, vec2).
      *
      * @param vec1            The first vector
      * @param vec2            The second vector
      * @param partial_result  The results of each work group
      */
      template <typename T>
      void inner_prod_impl(vector_base<T> const & vec1,
                           vector_base<T> const & vec2,
                           vector_base<T> & partial_result)
      {
        assert(viennacl::traits::opencl_handle(vec1).context() == viennacl::traits::opencl_handle(vec2).context() && bool("Vectors do not reside in the same OpenCL context. Automatic migration not yet supported!"));
        assert(viennacl::traits::opencl_handle(vec2).context() == viennacl::traits::opencl_handle(partial_result).context() && bool("Vectors do not reside in the same OpenCL context. Automatic migration not yet supported!"));

        viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(vec1).context());
        viennacl::linalg::opencl::kernels::vector<T>::init(ctx);

        assert( (viennacl::traits::size(vec1) == viennacl::traits::size(vec2))
              && bool("Incompatible vector sizes in inner_prod_impl()!"));

        viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector<T>::program_name(), "inner_prod1");

        assert( (k.global_work_size() / k.local_work_size() <= partial_result.size()) && bool("Size mismatch for partial reduction in inner_prod_impl()") );

        viennacl::ocl::packed_cl_uint size_vec1;
        size_vec1.start         = cl_uint(viennacl::traits::start(vec1));
        size_vec1.stride        = cl_uint(viennacl::traits::stride(vec1));
        size_vec1.size          = cl_uint(viennacl::traits::size(vec1));
        size_vec1.internal_size = cl_uint(viennacl::traits::internal_size(vec1));

        viennacl::ocl::packed_cl_uint size_vec2;
        size_vec2.start         = cl_uint(viennacl::traits::start(vec2));
        size_vec2.stride        = cl_uint(viennacl::traits::stride(vec2));
        size_vec2.size          = cl_uint(viennacl::traits::size(vec2));
        size_vec2.internal_size = cl_uint(viennacl::traits::internal_size(vec2));

        viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(vec1),
                                 size_vec1,
                                 viennacl::traits::opencl_handle(vec2),
                                 size_vec2,
                                 viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<T>::type) * k.local_work_size()),
                                 viennacl::traits::opencl_handle(partial_result)
                                )
                              );
      }


      //implementation of inner product:
      //namespace {
      /** @brief Computes the inner product of two vectors - implementation. Library users should call inner_prod(vec1, vec2).
      *
      * @param vec1    The first vector
      * @param vec2    The second vector
      * @param result  The result scalar (on the gpu)
      */
      template <typename T>
      void inner_prod_impl(vector_base<T> const & vec1,
                           vector_base<T> const & vec2,
                           scalar<T> & result)
      {
        assert(viennacl::traits::opencl_handle(vec1).context() == viennacl::traits::opencl_handle(vec2).context() && bool("Vectors do not reside in the same OpenCL context. Automatic migration not yet supported!"));
        assert(viennacl::traits::opencl_handle(vec1).context() == viennacl::traits::opencl_handle(result).context() && bool("Operands do not reside in the same OpenCL context. Automatic migration not yet supported!"));

        viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(vec1).context());

        vcl_size_t work_groups = 128;
        viennacl::vector<T> temp(work_groups, viennacl::traits::context(vec1));
        temp.resize(work_groups, ctx); // bring default-constructed vectors to the correct size:

        // Step 1: Compute partial inner products for each work group:
        inner_prod_impl(vec1, vec2, temp);

        // Step 2: Sum partial results:
        viennacl::ocl::kernel & ksum = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector<T>::program_name(), "sum");

        ksum.local_work_size(0, work_groups);
        ksum.global_work_size(0, work_groups);
        viennacl::ocl::enqueue(ksum(viennacl::traits::opencl_handle(temp),
                                    cl_uint(viennacl::traits::start(temp)),
                                    cl_uint(viennacl::traits::stride(temp)),
                                    cl_uint(viennacl::traits::size(temp)),
                                    cl_uint(1),
                                    viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<T>::type) * ksum.local_work_size()),
                                    viennacl::traits::opencl_handle(result) )
                              );
      }

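      // Usage sketch (illustration only): both overloads above are reached through the public API,
      //   viennacl::scalar<float> s = viennacl::linalg::inner_prod(v1, v2); // result stays on the device
      //   float s_host              = viennacl::linalg::inner_prod(v1, v2); // final reduction on the host, see inner_prod_cpu() below
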
      namespace detail
      {
        template <typename ScalarT>
        viennacl::ocl::packed_cl_uint make_layout(vector_base<ScalarT> const & vec)
        {
          viennacl::ocl::packed_cl_uint ret;
          ret.start         = cl_uint(viennacl::traits::start(vec));
          ret.stride        = cl_uint(viennacl::traits::stride(vec));
          ret.size          = cl_uint(viennacl::traits::size(vec));
          ret.internal_size = cl_uint(viennacl::traits::internal_size(vec));
          return ret;
        }
      }

      /** @brief Computes multiple inner products <x, y1>, <x, y2>, ..., <x, yN>, where the first argument x is common to all inner products.
      *
      * @param x          The common vector
      * @param vec_tuple  The tuple of vectors y1, y2, ..., yN
      * @param result     The result vector
      */
      template <typename T>
      void inner_prod_impl(vector_base<T> const & x,
                           vector_tuple<T> const & vec_tuple,
                           vector_base<T> & result)
      {
        assert(viennacl::traits::opencl_handle(x).context() == viennacl::traits::opencl_handle(result).context() && bool("Operands do not reside in the same OpenCL context. Automatic migration not yet supported!"));

        viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(x).context());
        viennacl::linalg::opencl::kernels::vector<T>::init(ctx);
        viennacl::linalg::opencl::kernels::vector_multi_inner_prod<T>::init(ctx);

        vcl_size_t work_groups = 128;

        viennacl::vector<T> temp(work_groups, viennacl::traits::context(x));
        temp.resize(8 * work_groups, ctx); // bring default-constructed vectors to the correct size:

        viennacl::ocl::packed_cl_uint layout_x = detail::make_layout(x);

        viennacl::ocl::kernel & ksum                = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector_multi_inner_prod<T>::program_name(), "sum_inner_prod");
        viennacl::ocl::kernel & inner_prod_kernel_1 = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector<T>::program_name(), "inner_prod1");
        viennacl::ocl::kernel & inner_prod_kernel_2 = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector_multi_inner_prod<T>::program_name(), "inner_prod2");
        viennacl::ocl::kernel & inner_prod_kernel_3 = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector_multi_inner_prod<T>::program_name(), "inner_prod3");
        viennacl::ocl::kernel & inner_prod_kernel_4 = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector_multi_inner_prod<T>::program_name(), "inner_prod4");
        viennacl::ocl::kernel & inner_prod_kernel_8 = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector_multi_inner_prod<T>::program_name(), "inner_prod8");

        vcl_size_t current_index = 0;
        while (current_index < vec_tuple.const_size())
        {
          switch (vec_tuple.const_size() - current_index)
          {
            case 7:
            case 6:
            case 5:
            case 4:
            {
              vector_base<T> const & y0 = vec_tuple.const_at(current_index    );
              vector_base<T> const & y1 = vec_tuple.const_at(current_index + 1);
              vector_base<T> const & y2 = vec_tuple.const_at(current_index + 2);
              vector_base<T> const & y3 = vec_tuple.const_at(current_index + 3);
              viennacl::ocl::enqueue(inner_prod_kernel_4( viennacl::traits::opencl_handle(x), layout_x,
                                                          viennacl::traits::opencl_handle(y0), detail::make_layout(y0),
                                                          viennacl::traits::opencl_handle(y1), detail::make_layout(y1),
                                                          viennacl::traits::opencl_handle(y2), detail::make_layout(y2),
                                                          viennacl::traits::opencl_handle(y3), detail::make_layout(y3),
                                                          viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<T>::type) * 4 * inner_prod_kernel_4.local_work_size()),
                                                          viennacl::traits::opencl_handle(temp)
                                                        ) );

              ksum.local_work_size(0, work_groups);
              ksum.global_work_size(0, 4 * work_groups);
              viennacl::ocl::enqueue(ksum(viennacl::traits::opencl_handle(temp),
                                          viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<T>::type) * ksum.local_work_size()),
                                          viennacl::traits::opencl_handle(result),
                                          cl_uint(viennacl::traits::start(result) + current_index * viennacl::traits::stride(result)),
                                          cl_uint(viennacl::traits::stride(result))
                                         )
                                    );
            }
              current_index += 4;
              break;

            case 3:
            {
              vector_base<T> const & y0 = vec_tuple.const_at(current_index    );
              vector_base<T> const & y1 = vec_tuple.const_at(current_index + 1);
              vector_base<T> const & y2 = vec_tuple.const_at(current_index + 2);
              viennacl::ocl::enqueue(inner_prod_kernel_3( viennacl::traits::opencl_handle(x), layout_x,
                                                          viennacl::traits::opencl_handle(y0), detail::make_layout(y0),
                                                          viennacl::traits::opencl_handle(y1), detail::make_layout(y1),
                                                          viennacl::traits::opencl_handle(y2), detail::make_layout(y2),
                                                          viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<T>::type) * 3 * inner_prod_kernel_3.local_work_size()),
                                                          viennacl::traits::opencl_handle(temp)
                                                        ) );

              ksum.local_work_size(0, work_groups);
              ksum.global_work_size(0, 3 * work_groups);
              viennacl::ocl::enqueue(ksum(viennacl::traits::opencl_handle(temp),
                                          viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<T>::type) * ksum.local_work_size()),
                                          viennacl::traits::opencl_handle(result),
                                          cl_uint(viennacl::traits::start(result) + current_index * viennacl::traits::stride(result)),
                                          cl_uint(viennacl::traits::stride(result))
                                         )
                                    );
            }
              current_index += 3;
              break;

            case 2:
            {
              vector_base<T> const & y0 = vec_tuple.const_at(current_index    );
              vector_base<T> const & y1 = vec_tuple.const_at(current_index + 1);
              viennacl::ocl::enqueue(inner_prod_kernel_2( viennacl::traits::opencl_handle(x), layout_x,
                                                          viennacl::traits::opencl_handle(y0), detail::make_layout(y0),
                                                          viennacl::traits::opencl_handle(y1), detail::make_layout(y1),
                                                          viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<T>::type) * 2 * inner_prod_kernel_2.local_work_size()),
                                                          viennacl::traits::opencl_handle(temp)
                                                        ) );

              ksum.local_work_size(0, work_groups);
              ksum.global_work_size(0, 2 * work_groups);
              viennacl::ocl::enqueue(ksum(viennacl::traits::opencl_handle(temp),
                                          viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<T>::type) * ksum.local_work_size()),
                                          viennacl::traits::opencl_handle(result),
                                          cl_uint(viennacl::traits::start(result) + current_index * viennacl::traits::stride(result)),
                                          cl_uint(viennacl::traits::stride(result))
                                         )
                                    );
            }
              current_index += 2;
              break;

            case 1:
            {
              vector_base<T> const & y0 = vec_tuple.const_at(current_index    );
              viennacl::ocl::enqueue(inner_prod_kernel_1( viennacl::traits::opencl_handle(x), layout_x,
                                                          viennacl::traits::opencl_handle(y0), detail::make_layout(y0),
                                                          viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<T>::type) * 1 * inner_prod_kernel_1.local_work_size()),
                                                          viennacl::traits::opencl_handle(temp)
                                                        ) );

              ksum.local_work_size(0, work_groups);
              ksum.global_work_size(0, 1 * work_groups);
              viennacl::ocl::enqueue(ksum(viennacl::traits::opencl_handle(temp),
                                          viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<T>::type) * ksum.local_work_size()),
                                          viennacl::traits::opencl_handle(result),
                                          cl_uint(viennacl::traits::start(result) + current_index * viennacl::traits::stride(result)),
                                          cl_uint(viennacl::traits::stride(result))
                                         )
                                    );
            }
              current_index += 1;
              break;

            default: //8 or more vectors
            {
              vector_base<T> const & y0 = vec_tuple.const_at(current_index    );
              vector_base<T> const & y1 = vec_tuple.const_at(current_index + 1);
              vector_base<T> const & y2 = vec_tuple.const_at(current_index + 2);
              vector_base<T> const & y3 = vec_tuple.const_at(current_index + 3);
              vector_base<T> const & y4 = vec_tuple.const_at(current_index + 4);
              vector_base<T> const & y5 = vec_tuple.const_at(current_index + 5);
              vector_base<T> const & y6 = vec_tuple.const_at(current_index + 6);
              vector_base<T> const & y7 = vec_tuple.const_at(current_index + 7);
              viennacl::ocl::enqueue(inner_prod_kernel_8( viennacl::traits::opencl_handle(x), layout_x,
                                                          viennacl::traits::opencl_handle(y0), detail::make_layout(y0),
                                                          viennacl::traits::opencl_handle(y1), detail::make_layout(y1),
                                                          viennacl::traits::opencl_handle(y2), detail::make_layout(y2),
                                                          viennacl::traits::opencl_handle(y3), detail::make_layout(y3),
                                                          viennacl::traits::opencl_handle(y4), detail::make_layout(y4),
                                                          viennacl::traits::opencl_handle(y5), detail::make_layout(y5),
                                                          viennacl::traits::opencl_handle(y6), detail::make_layout(y6),
                                                          viennacl::traits::opencl_handle(y7), detail::make_layout(y7),
                                                          viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<T>::type) * 8 * inner_prod_kernel_8.local_work_size()),
                                                          viennacl::traits::opencl_handle(temp)
                                                        ) );

              ksum.local_work_size(0, work_groups);
              ksum.global_work_size(0, 8 * work_groups);
              viennacl::ocl::enqueue(ksum(viennacl::traits::opencl_handle(temp),
                                          viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<T>::type) * ksum.local_work_size()),
                                          viennacl::traits::opencl_handle(result),
                                          cl_uint(viennacl::traits::start(result) + current_index * viennacl::traits::stride(result)),
                                          cl_uint(viennacl::traits::stride(result))
                                         )
                                    );
            }
              current_index += 8;
              break;
          }
        }

      }


      //implementation of inner product:
      //namespace {
      /** @brief Computes the inner product of two vectors with the final reduction step on the CPU - implementation. Library users should call inner_prod(vec1, vec2).
      *
      * @param vec1    The first vector
      * @param vec2    The second vector
      * @param result  The result scalar (on the host)
      */
      template <typename T>
      void inner_prod_cpu(vector_base<T> const & vec1,
                          vector_base<T> const & vec2,
                          T & result)
      {
        assert(viennacl::traits::opencl_handle(vec1).context() == viennacl::traits::opencl_handle(vec2).context() && bool("Vectors do not reside in the same OpenCL context. Automatic migration not yet supported!"));

        viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(vec1).context());

        vcl_size_t work_groups = 128;
        viennacl::vector<T> temp(work_groups, viennacl::traits::context(vec1));
        temp.resize(work_groups, ctx); // bring default-constructed vectors to the correct size:

        // Step 1: Compute partial inner products for each work group:
        inner_prod_impl(vec1, vec2, temp);

        // Step 2: Sum partial results:

        // Now copy partial results from GPU back to CPU and run reduction there:
        std::vector<T> temp_cpu(work_groups);
        viennacl::fast_copy(temp.begin(), temp.end(), temp_cpu.begin());

        result = 0;
        for (typename std::vector<T>::const_iterator it = temp_cpu.begin(); it != temp_cpu.end(); ++it)
          result += *it;
      }


      /** @brief Computes the partial work group results for vector norms
      *
      * @param vec             The vector
      * @param partial_result  The result vector holding one partial result per work group
      * @param norm_id         Norm selector: 0 for the supremum norm, 1 for the 1-norm, 2 for the 2-norm (cf. the callers below)
      */
      template <typename T>
      void norm_reduction_impl(vector_base<T> const & vec,
                               vector_base<T> & partial_result,
                               cl_uint norm_id)
      {
        assert(viennacl::traits::opencl_handle(vec).context() == viennacl::traits::opencl_handle(partial_result).context() && bool("Operands do not reside in the same OpenCL context. Automatic migration not yet supported!"));

        viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(vec).context());
        viennacl::linalg::opencl::kernels::vector<T>::init(ctx);

        viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector<T>::program_name(), "norm");

        assert( (k.global_work_size() / k.local_work_size() <= partial_result.size()) && bool("Size mismatch for partial reduction in norm_reduction_impl()") );

        viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(vec),
                                 cl_uint(viennacl::traits::start(vec)),
                                 cl_uint(viennacl::traits::stride(vec)),
                                 cl_uint(viennacl::traits::size(vec)),
                                 cl_uint(norm_id),
                                 viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<T>::type) * k.local_work_size()),
                                 viennacl::traits::opencl_handle(partial_result) )
                              );
      }


      /** @brief Computes the l^1-norm of a vector
      *
      * @param vec     The vector
      * @param result  The result scalar
      */
      template <typename T>
      void norm_1_impl(vector_base<T> const & vec,
                       scalar<T> & result)
      {
        assert(viennacl::traits::opencl_handle(vec).context() == viennacl::traits::opencl_handle(result).context() && bool("Operands do not reside in the same OpenCL context. Automatic migration not yet supported!"));

        viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(vec).context());

        vcl_size_t work_groups = 128;
        viennacl::vector<T> temp(work_groups, viennacl::traits::context(vec));

        // Step 1: Compute the partial work group results
        norm_reduction_impl(vec, temp, 1);

        // Step 2: Compute the final reduction using OpenCL
        viennacl::ocl::kernel & ksum = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector<T>::program_name(), "sum");

        ksum.local_work_size(0, work_groups);
        ksum.global_work_size(0, work_groups);
        viennacl::ocl::enqueue(ksum(viennacl::traits::opencl_handle(temp),
                                    cl_uint(viennacl::traits::start(temp)),
                                    cl_uint(viennacl::traits::stride(temp)),
                                    cl_uint(viennacl::traits::size(temp)),
                                    cl_uint(1),
                                    viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<T>::type) * ksum.local_work_size()),
                                    result)
                              );
      }

      /** @brief Computes the l^1-norm of a vector with final reduction on the CPU
      *
      * @param vec     The vector
      * @param result  The result scalar
      */
      template <typename T>
      void norm_1_cpu(vector_base<T> const & vec,
                      T & result)
      {
        vcl_size_t work_groups = 128;
        viennacl::vector<T> temp(work_groups, viennacl::traits::context(vec));

        // Step 1: Compute the partial work group results
        norm_reduction_impl(vec, temp, 1);

        // Step 2: Now copy partial results from GPU back to CPU and run reduction there:
        typedef std::vector<typename viennacl::result_of::cl_type<T>::type> CPUVectorType;

        CPUVectorType temp_cpu(work_groups);
        viennacl::fast_copy(temp.begin(), temp.end(), temp_cpu.begin());

        result = 0;
        for (typename CPUVectorType::const_iterator it = temp_cpu.begin(); it != temp_cpu.end(); ++it)
          result += static_cast<T>(*it);
      }



      /** @brief Computes the l^2-norm of a vector - implementation using OpenCL summation at second step
      *
      * @param vec     The vector
      * @param result  The result scalar
      */
      template <typename T>
      void norm_2_impl(vector_base<T> const & vec,
                       scalar<T> & result)
      {
        assert(viennacl::traits::opencl_handle(vec).context() == viennacl::traits::opencl_handle(result).context() && bool("Operands do not reside in the same OpenCL context. Automatic migration not yet supported!"));

        viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(vec).context());

        vcl_size_t work_groups = 128;
        viennacl::vector<T> temp(work_groups, viennacl::traits::context(vec));

        // Step 1: Compute the partial work group results
        norm_reduction_impl(vec, temp, 2);

        // Step 2: Reduction via OpenCL
        viennacl::ocl::kernel & ksum = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector<T>::program_name(), "sum");

        ksum.local_work_size(0, work_groups);
        ksum.global_work_size(0, work_groups);
        viennacl::ocl::enqueue( ksum(viennacl::traits::opencl_handle(temp),
                                     cl_uint(viennacl::traits::start(temp)),
                                     cl_uint(viennacl::traits::stride(temp)),
                                     cl_uint(viennacl::traits::size(temp)),
                                     cl_uint(2),
                                     viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<T>::type) * ksum.local_work_size()),
                                     result)
                              );
      }

      /** @brief Computes the l^2-norm of a vector with final reduction on the CPU
      *
      * @param vec     The vector
      * @param result  The result scalar
      */
      template <typename T>
      void norm_2_cpu(vector_base<T> const & vec,
                      T & result)
      {
        vcl_size_t work_groups = 128;
        viennacl::vector<T> temp(work_groups, viennacl::traits::context(vec));

        // Step 1: Compute the partial work group results
        norm_reduction_impl(vec, temp, 2);

        // Step 2: Now copy partial results from GPU back to CPU and run reduction there:
        typedef std::vector<typename viennacl::result_of::cl_type<T>::type> CPUVectorType;

        CPUVectorType temp_cpu(work_groups);
        viennacl::fast_copy(temp.begin(), temp.end(), temp_cpu.begin());

        result = 0;
        for (typename CPUVectorType::const_iterator it = temp_cpu.begin(); it != temp_cpu.end(); ++it)
          result += static_cast<T>(*it);
        result = std::sqrt(result);
      }

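      // Usage sketch (illustration only): these backends serve the public norm API, e.g.
      //   viennacl::scalar<float> n2 = viennacl::linalg::norm_2(v); // second reduction stage on the device
      //   float n1                   = viennacl::linalg::norm_1(v); // second reduction stage on the host
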


      /** @brief Computes the supremum-norm of a vector
      *
      * @param vec     The vector
      * @param result  The result scalar
      */
      template <typename T>
      void norm_inf_impl(vector_base<T> const & vec,
                         scalar<T> & result)
      {
        assert(viennacl::traits::opencl_handle(vec).context() == viennacl::traits::opencl_handle(result).context() && bool("Operands do not reside in the same OpenCL context. Automatic migration not yet supported!"));

        viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(vec).context());

        vcl_size_t work_groups = 128;
        viennacl::vector<T> temp(work_groups, viennacl::traits::context(vec));

        // Step 1: Compute the partial work group results
        norm_reduction_impl(vec, temp, 0);

        // Step 2: Parallel reduction of the partial results:
        viennacl::ocl::kernel & ksum = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector<T>::program_name(), "sum");
        ksum.local_work_size(0, work_groups);
        ksum.global_work_size(0, work_groups);

        viennacl::ocl::enqueue( ksum(viennacl::traits::opencl_handle(temp),
                                     cl_uint(viennacl::traits::start(temp)),
                                     cl_uint(viennacl::traits::stride(temp)),
                                     cl_uint(viennacl::traits::size(temp)),
                                     cl_uint(0),
                                     viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<T>::type) * ksum.local_work_size()),
                                     result)
                              );
      }

      /** @brief Computes the supremum-norm of a vector with final reduction on the CPU
      *
      * @param vec     The vector
      * @param result  The result scalar
      */
      template <typename T>
      void norm_inf_cpu(vector_base<T> const & vec,
                        T & result)
      {
        vcl_size_t work_groups = 128;
        viennacl::vector<T> temp(work_groups, viennacl::traits::context(vec));

        // Step 1: Compute the partial work group results
        norm_reduction_impl(vec, temp, 0);

        // Step 2: Now copy partial results from GPU back to CPU and run reduction there:
        typedef std::vector<typename viennacl::result_of::cl_type<T>::type> CPUVectorType;

        CPUVectorType temp_cpu(work_groups);
        viennacl::fast_copy(temp.begin(), temp.end(), temp_cpu.begin());

        result = 0;
        for (typename CPUVectorType::const_iterator it = temp_cpu.begin(); it != temp_cpu.end(); ++it)
          result = std::max(result, static_cast<T>(*it));
      }



      //This function should return a CPU scalar, otherwise statements like
      // vcl_rhs[index_norm_inf(vcl_rhs)]
      // are ambiguous
      /** @brief Computes the index of the first entry that is equal to the supremum-norm in modulus
      *
      * @param vec  The vector
      * @return The result. Note that the result must be a CPU scalar (unsigned int), since GPU scalars are floating point types.
      */
      template <typename T>
      cl_uint index_norm_inf(vector_base<T> const & vec)
      {
        viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(vec).context());
        viennacl::linalg::opencl::kernels::vector<T>::init(ctx);

        viennacl::ocl::handle<cl_mem> h = ctx.create_memory(CL_MEM_READ_WRITE, sizeof(cl_uint));

        //cl_uint size = static_cast<cl_uint>(vcl_vec.internal_size());

        //TODO: Use multi-group kernel for large vector sizes

        viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector<T>::program_name(), "index_norm_inf");
        viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(vec),
                                 cl_uint(viennacl::traits::start(vec)),
                                 cl_uint(viennacl::traits::stride(vec)),
                                 cl_uint(viennacl::traits::size(vec)),
                                 viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<T>::type) * k.local_work_size()),
                                 viennacl::ocl::local_mem(sizeof(cl_uint) * k.local_work_size()), h));

        //read value:
        cl_uint result;
        cl_int err = clEnqueueReadBuffer(ctx.get_queue().handle().get(), h.get(), CL_TRUE, 0, sizeof(cl_uint), &result, 0, NULL, NULL);
        VIENNACL_ERR_CHECK(err);
        return result;
      }

      //TODO: Special case vec1 == vec2 allows improvement!!
      /** @brief Computes a plane rotation of two vectors.
      *
      * Computes (x,y) <- (alpha * x + beta * y, -beta * x + alpha * y)
      *
      * @param vec1   The first vector
      * @param vec2   The second vector
      * @param alpha  The first transformation coefficient
      * @param beta   The second transformation coefficient
      */
      template <typename T>
      void plane_rotation(vector_base<T> & vec1,
                          vector_base<T> & vec2,
                          T alpha, T beta)
      {
        assert(viennacl::traits::opencl_handle(vec1).context() == viennacl::traits::opencl_handle(vec2).context() && bool("Operands do not reside in the same OpenCL context. Automatic migration not yet supported!"));

        viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(vec1).context());
        viennacl::linalg::opencl::kernels::vector<T>::init(ctx);

        assert(viennacl::traits::size(vec1) == viennacl::traits::size(vec2));
        viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector<T>::program_name(), "plane_rotation");

        viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(vec1),
                                 cl_uint(viennacl::traits::start(vec1)),
                                 cl_uint(viennacl::traits::stride(vec1)),
                                 cl_uint(viennacl::traits::size(vec1)),
                                 viennacl::traits::opencl_handle(vec2),
                                 cl_uint(viennacl::traits::start(vec2)),
                                 cl_uint(viennacl::traits::stride(vec2)),
                                 cl_uint(viennacl::traits::size(vec2)),
                                 viennacl::traits::opencl_handle(alpha),
                                 viennacl::traits::opencl_handle(beta))
                              );
      }

    } //namespace opencl
  } //namespace linalg
} //namespace viennacl


#endif