#ifndef VIENNACL_LINALG_CUDA_VECTOR_OPERATIONS_HPP_
#define VIENNACL_LINALG_CUDA_VECTOR_OPERATIONS_HPP_

/** @file  viennacl/linalg/cuda/vector_operations.hpp
    @brief Implementations of vector operations using a CUDA backend
*/

// Includes reconstructed from this header's cross-references:
#include <cmath>
#include <vector>
#include <algorithm>

#include "viennacl/forwards.h"
#include "viennacl/scalar.hpp"
#include "viennacl/meta/predicate.hpp"
#include "viennacl/meta/enable_if.hpp"
#include "viennacl/traits/size.hpp"
#include "viennacl/traits/start.hpp"
#include "viennacl/traits/stride.hpp"
#include "viennacl/linalg/cuda/common.hpp"

namespace viennacl
{
namespace linalg
{
namespace cuda
{
//////////////////////// av //////////////////////////

template <typename T>
__global__ void av_kernel(T * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                          const T * fac2, unsigned int options2,
                          const T * vec2, unsigned int start2, unsigned int inc2)
{
  T alpha = *fac2;
  if (options2 & (1 << 0))  // bit 0: flip sign
    alpha = -alpha;

  if (options2 & (1 << 1))  // bit 1: use reciprocal (divide instead of multiply)
  {
    for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
      vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha;
  }
  else
  {
    for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
      vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha;
  }
}

// Overload taking the scaling factor by value (host scalar) instead of via a device pointer:
template <typename T>
__global__ void av_kernel(T * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                          T fac2, unsigned int options2,
                          const T * vec2, unsigned int start2, unsigned int inc2)
{
  T alpha = fac2;
  if (options2 & (1 << 0))
    alpha = -alpha;

  if (options2 & (1 << 1))
  {
    for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
      vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha;
  }
  else
  {
    for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
      vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha;
  }
}
template <typename T, typename ScalarType1>
void av(vector_base<T> & vec1,
        vector_base<T> const & vec2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha)
{
  typedef T value_type;

  unsigned int options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);

  value_type data_alpha = alpha;
  if (flip_sign_alpha)
    data_alpha = -data_alpha;
  if (reciprocal_alpha)
    data_alpha = static_cast<value_type>(1) / data_alpha;

  value_type temporary_alpha = 0;
  if (viennacl::is_cpu_scalar<ScalarType1>::value)
    temporary_alpha = alpha;

  av_kernel<<<128, 128>>>(
      detail::cuda_arg<value_type>(vec1),
      static_cast<unsigned int>(viennacl::traits::start(vec1)), static_cast<unsigned int>(viennacl::traits::stride(vec1)), static_cast<unsigned int>(viennacl::traits::size(vec1)),
      detail::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)), options_alpha,
      detail::cuda_arg<value_type>(vec2),
      static_cast<unsigned int>(viennacl::traits::start(vec2)), static_cast<unsigned int>(viennacl::traits::stride(vec2)) );
  VIENNACL_CUDA_LAST_ERROR_CHECK("av_kernel");
}
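// Usage sketch (illustrative, not part of the original header): av() is the
// CUDA backend routine behind scaled vector assignment. User code normally
// reaches it through the operator overloads of viennacl::vector, e.g.:
//
//   viennacl::vector<float> x(1000), y(1000);
//   float alpha = 2.5f;
//   x = alpha * y;   // dispatches to viennacl::linalg::cuda::av() on CUDA contexts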
//////////////////////// avbv //////////////////////////

template <typename T>
__global__ void avbv_kernel(T * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                            const T * fac2, unsigned int options2,
                            const T * vec2, unsigned int start2, unsigned int inc2,
                            const T * fac3, unsigned int options3,
                            const T * vec3, unsigned int start3, unsigned int inc3)
{
  T alpha = *fac2;
  if (options2 & (1 << 0))
    alpha = -alpha;
  T beta = *fac3;
  if (options3 & (1 << 0))
    beta = -beta;

  if (options2 & (1 << 1))
  {
    if (options3 & (1 << 1))
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] / beta;
    else
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] * beta;
  }
  else
  {
    if (options3 & (1 << 1))
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] / beta;
    else
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] * beta;
  }
}

// Overload: alpha passed by value, beta via device pointer.
template <typename T>
__global__ void avbv_kernel(T * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                            T fac2, unsigned int options2,
                            const T * vec2, unsigned int start2, unsigned int inc2,
                            const T * fac3, unsigned int options3,
                            const T * vec3, unsigned int start3, unsigned int inc3)
{
  T alpha = fac2;
  if (options2 & (1 << 0))
    alpha = -alpha;
  T beta = *fac3;
  if (options3 & (1 << 0))
    beta = -beta;

  if (options2 & (1 << 1))
  {
    if (options3 & (1 << 1))
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] / beta;
    else
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] * beta;
  }
  else
  {
    if (options3 & (1 << 1))
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] / beta;
    else
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] * beta;
  }
}

// Overload: alpha via device pointer, beta passed by value.
template <typename T>
__global__ void avbv_kernel(T * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                            const T * fac2, unsigned int options2,
                            const T * vec2, unsigned int start2, unsigned int inc2,
                            T fac3, unsigned int options3,
                            const T * vec3, unsigned int start3, unsigned int inc3)
{
  T alpha = *fac2;
  if (options2 & (1 << 0))
    alpha = -alpha;
  T beta = fac3;
  if (options3 & (1 << 0))
    beta = -beta;

  if (options2 & (1 << 1))
  {
    if (options3 & (1 << 1))
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] / beta;
    else
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] * beta;
  }
  else
  {
    if (options3 & (1 << 1))
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] / beta;
    else
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] * beta;
  }
}

// Overload: both alpha and beta passed by value.
template <typename T>
__global__ void avbv_kernel(T * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                            T fac2, unsigned int options2,
                            const T * vec2, unsigned int start2, unsigned int inc2,
                            T fac3, unsigned int options3,
                            const T * vec3, unsigned int start3, unsigned int inc3)
{
  T alpha = fac2;
  if (options2 & (1 << 0))
    alpha = -alpha;
  T beta = fac3;
  if (options3 & (1 << 0))
    beta = -beta;

  if (options2 & (1 << 1))
  {
    if (options3 & (1 << 1))
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] / beta;
    else
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] * beta;
  }
  else
  {
    if (options3 & (1 << 1))
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] / beta;
    else
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] * beta;
  }
}
template <typename T, typename ScalarType1, typename ScalarType2>
void avbv(vector_base<T> & vec1,
          vector_base<T> const & vec2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
          vector_base<T> const & vec3, ScalarType2 const & beta,  vcl_size_t len_beta,  bool reciprocal_beta,  bool flip_sign_beta)
{
  typedef T value_type;

  unsigned int options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);

  value_type data_alpha = alpha;
  if (flip_sign_alpha)
    data_alpha = -data_alpha;
  if (reciprocal_alpha)
    data_alpha = static_cast<value_type>(1) / data_alpha;

  value_type temporary_alpha = 0;
  if (viennacl::is_cpu_scalar<ScalarType1>::value)
    temporary_alpha = alpha;

  unsigned int options_beta = detail::make_options(len_beta, reciprocal_beta, flip_sign_beta);

  value_type temporary_beta = 0;
  if (viennacl::is_cpu_scalar<ScalarType2>::value)
    temporary_beta = beta;

  avbv_kernel<<<128, 128>>>(
      detail::cuda_arg<value_type>(vec1),
      static_cast<unsigned int>(viennacl::traits::start(vec1)), static_cast<unsigned int>(viennacl::traits::stride(vec1)), static_cast<unsigned int>(viennacl::traits::size(vec1)),
      detail::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)), options_alpha,
      detail::cuda_arg<value_type>(vec2),
      static_cast<unsigned int>(viennacl::traits::start(vec2)), static_cast<unsigned int>(viennacl::traits::stride(vec2)),
      detail::cuda_arg<value_type>(detail::arg_reference(beta, temporary_beta)), options_beta,
      detail::cuda_arg<value_type>(vec3),
      static_cast<unsigned int>(viennacl::traits::start(vec3)), static_cast<unsigned int>(viennacl::traits::stride(vec3)) );
  VIENNACL_CUDA_LAST_ERROR_CHECK("avbv_kernel");
}
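// Note on the options2/options3 bit fields (a sketch inferred from the kernel
// bodies above; detail::make_options() lives in common.hpp): bit 0 requests a
// sign flip of the scalar, bit 1 requests its reciprocal, i.e. division
// instead of multiplication. The length argument is forwarded but not
// examined by the bit tests shown here. Illustrative:
//
//   unsigned int options = detail::make_options(len_alpha,
//                                               true,    // reciprocal: x = y / alpha
//                                               false);  // no sign flip
//   // (options & (1 << 1)) != 0  -> kernels take the division branch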
//////////////////////// avbv_v //////////////////////////

template <typename T>
__global__ void avbv_v_kernel(T * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                              const T * fac2, unsigned int options2,
                              const T * vec2, unsigned int start2, unsigned int inc2,
                              const T * fac3, unsigned int options3,
                              const T * vec3, unsigned int start3, unsigned int inc3)
{
  T alpha = *fac2;
  if (options2 & (1 << 0))
    alpha = -alpha;
  T beta = *fac3;
  if (options3 & (1 << 0))
    beta = -beta;

  if (options2 & (1 << 1))
  {
    if (options3 & (1 << 1))
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] += vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] / beta;
    else
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] += vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] * beta;
  }
  else
  {
    if (options3 & (1 << 1))
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] += vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] / beta;
    else
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] += vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] * beta;
  }
}

// Overload: alpha passed by value, beta via device pointer.
template <typename T>
__global__ void avbv_v_kernel(T * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                              T fac2, unsigned int options2,
                              const T * vec2, unsigned int start2, unsigned int inc2,
                              const T * fac3, unsigned int options3,
                              const T * vec3, unsigned int start3, unsigned int inc3)
{
  T alpha = fac2;
  if (options2 & (1 << 0))
    alpha = -alpha;
  T beta = *fac3;
  if (options3 & (1 << 0))
    beta = -beta;

  if (options2 & (1 << 1))
  {
    if (options3 & (1 << 1))
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] += vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] / beta;
    else
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] += vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] * beta;
  }
  else
  {
    if (options3 & (1 << 1))
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] += vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] / beta;
    else
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] += vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] * beta;
  }
}

// Overload: alpha via device pointer, beta passed by value.
template <typename T>
__global__ void avbv_v_kernel(T * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                              const T * fac2, unsigned int options2,
                              const T * vec2, unsigned int start2, unsigned int inc2,
                              T fac3, unsigned int options3,
                              const T * vec3, unsigned int start3, unsigned int inc3)
{
  T alpha = *fac2;
  if (options2 & (1 << 0))
    alpha = -alpha;
  T beta = fac3;
  if (options3 & (1 << 0))
    beta = -beta;

  if (options2 & (1 << 1))
  {
    if (options3 & (1 << 1))
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] += vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] / beta;
    else
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] += vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] * beta;
  }
  else
  {
    if (options3 & (1 << 1))
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] += vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] / beta;
    else
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] += vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] * beta;
  }
}

// Overload: both alpha and beta passed by value.
template <typename T>
__global__ void avbv_v_kernel(T * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                              T fac2, unsigned int options2,
                              const T * vec2, unsigned int start2, unsigned int inc2,
                              T fac3, unsigned int options3,
                              const T * vec3, unsigned int start3, unsigned int inc3)
{
  T alpha = fac2;
  if (options2 & (1 << 0))
    alpha = -alpha;
  T beta = fac3;
  if (options3 & (1 << 0))
    beta = -beta;

  if (options2 & (1 << 1))
  {
    if (options3 & (1 << 1))
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] += vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] / beta;
    else
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] += vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] * beta;
  }
  else
  {
    if (options3 & (1 << 1))
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] += vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] / beta;
    else
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] += vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] * beta;
  }
}
template <typename T, typename ScalarType1, typename ScalarType2>
void avbv_v(vector_base<T> & vec1,
            vector_base<T> const & vec2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
            vector_base<T> const & vec3, ScalarType2 const & beta,  vcl_size_t len_beta,  bool reciprocal_beta,  bool flip_sign_beta)
{
  typedef T value_type;

  unsigned int options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);

  value_type data_alpha = alpha;
  if (flip_sign_alpha)
    data_alpha = -data_alpha;
  if (reciprocal_alpha)
    data_alpha = static_cast<value_type>(1) / data_alpha;

  value_type temporary_alpha = 0;
  if (viennacl::is_cpu_scalar<ScalarType1>::value)
    temporary_alpha = alpha;

  unsigned int options_beta = detail::make_options(len_beta, reciprocal_beta, flip_sign_beta);

  value_type temporary_beta = 0;
  if (viennacl::is_cpu_scalar<ScalarType2>::value)
    temporary_beta = beta;

  avbv_v_kernel<<<128, 128>>>(
      detail::cuda_arg<value_type>(vec1),
      static_cast<unsigned int>(viennacl::traits::start(vec1)), static_cast<unsigned int>(viennacl::traits::stride(vec1)), static_cast<unsigned int>(viennacl::traits::size(vec1)),
      detail::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)), options_alpha,
      detail::cuda_arg<value_type>(vec2),
      static_cast<unsigned int>(viennacl::traits::start(vec2)), static_cast<unsigned int>(viennacl::traits::stride(vec2)),
      detail::cuda_arg<value_type>(detail::arg_reference(beta, temporary_beta)), options_beta,
      detail::cuda_arg<value_type>(vec3),
      static_cast<unsigned int>(viennacl::traits::start(vec3)), static_cast<unsigned int>(viennacl::traits::stride(vec3)) );
  VIENNACL_CUDA_LAST_ERROR_CHECK("avbv_v_kernel");
}
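// Usage sketch (illustrative): avbv() implements x = a*y + b*z, while
// avbv_v() is the accumulating variant x += a*y + b*z:
//
//   x  = alpha * y + beta * z;   // -> cuda::avbv()
//   x += alpha * y + beta * z;   // -> cuda::avbv_v()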
//////////////////////// vector_assign //////////////////////////

template <typename T>
__global__ void vector_assign_kernel(T * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                                     unsigned int internal_size1,
                                     T alpha)
{
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < internal_size1; i += gridDim.x * blockDim.x)
    vec1[i*inc1+start1] = (i < size1) ? alpha : 0;  // zero out the padding region
}

/** @brief Assign a constant value to a vector (-range/-slice) */
template <typename T, typename S1>
void vector_assign(vector_base<T> & vec1, const S1 & alpha, bool up_to_internal_size = false)
{
  typedef T value_type;

  value_type temporary_alpha = 0;
  if (viennacl::is_cpu_scalar<S1>::value)
    temporary_alpha = alpha;

  unsigned int size = up_to_internal_size ? static_cast<unsigned int>(vec1.internal_size())
                                          : static_cast<unsigned int>(viennacl::traits::size(vec1));

  vector_assign_kernel<<<128, 128>>>(
      detail::cuda_arg<value_type>(vec1),
      static_cast<unsigned int>(viennacl::traits::start(vec1)), static_cast<unsigned int>(viennacl::traits::stride(vec1)), size,
      static_cast<unsigned int>(vec1.internal_size()),
      detail::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)) );
  VIENNACL_CUDA_LAST_ERROR_CHECK("vector_assign_kernel");
}
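// Usage sketch (illustrative assumption about the dispatch path):
// vector_assign() backs operations that fill a vector with one value, such as
// clearing a vector or assigning from viennacl::scalar_vector:
//
//   viennacl::vector<float> x(1000);
//   x.clear();                                    // roughly cuda::vector_assign(x, 0, true)
//   x = viennacl::scalar_vector<float>(1000, 1.0f);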
//////////////////////// vector_swap //////////////////////////

template <typename T>
__global__ void vector_swap_kernel(T * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                                   T * vec2, unsigned int start2, unsigned int inc2)
{
  T tmp;
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
  {
    tmp = vec2[i*inc2+start2];
    vec2[i*inc2+start2] = vec1[i*inc1+start1];
    vec1[i*inc1+start1] = tmp;
  }
}

/** @brief Swaps the contents of two vectors, data is copied */
template <typename T>
void vector_swap(vector_base<T> & vec1, vector_base<T> & vec2)
{
  typedef T value_type;

  vector_swap_kernel<<<128, 128>>>(
      detail::cuda_arg<value_type>(vec1),
      static_cast<unsigned int>(viennacl::traits::start(vec1)), static_cast<unsigned int>(viennacl::traits::stride(vec1)), static_cast<unsigned int>(viennacl::traits::size(vec1)),
      detail::cuda_arg<value_type>(vec2),
      static_cast<unsigned int>(viennacl::traits::start(vec2)), static_cast<unsigned int>(viennacl::traits::stride(vec2)) );
  VIENNACL_CUDA_LAST_ERROR_CHECK("vector_swap_kernel");
}
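// Usage sketch (illustrative): user code reaches vector_swap() through
// viennacl::swap(); both vectors must have the same length:
//
//   viennacl::swap(v1, v2);   // element-wise swap via vector_swap_kernel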
///////////////////////// element-wise binary operations /////////////////////////////

template <typename T>
__global__ void element_op_kernel(T * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                                  T const * vec2, unsigned int start2, unsigned int inc2,
                                  T const * vec3, unsigned int start3, unsigned int inc3,
                                  unsigned int op_type)  // 0: product, 1: division, 2: power
{
  if (op_type == 2)
  {
    for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
      vec1[i*inc1+start1] = pow(vec2[i*inc2+start2], vec3[i*inc3+start3]);
  }
  else if (op_type == 1)
  {
    for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
      vec1[i*inc1+start1] = vec2[i*inc2+start2] / vec3[i*inc3+start3];
  }
  else if (op_type == 0)
  {
    for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
      vec1[i*inc1+start1] = vec2[i*inc2+start2] * vec3[i*inc3+start3];
  }
}

// Integer variant: no pow() branch.
template <typename T>
__global__ void element_op_int_kernel(T * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                                      T const * vec2, unsigned int start2, unsigned int inc2,
                                      T const * vec3, unsigned int start3, unsigned int inc3,
                                      unsigned int op_type)  // 0: product, 1: division
{
  if (op_type == 1)
  {
    for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
      vec1[i*inc1+start1] = vec2[i*inc2+start2] / vec3[i*inc3+start3];
  }
  else if (op_type == 0)
  {
    for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
      vec1[i*inc1+start1] = vec2[i*inc2+start2] * vec3[i*inc3+start3];
  }
}
// Generic (integer) host dispatcher:
template <typename T, typename OP>
void element_op(vector_base<T> & vec1,
                vector_expression<const vector_base<T>, const vector_base<T>, op_element_binary<OP> > const & proxy)
{
  typedef T value_type;

  unsigned int op_type = 2; //0: product, 1: division, 2: power
  if (viennacl::is_division<OP>::value)
    op_type = 1;
  else if (viennacl::is_product<OP>::value)
    op_type = 0;

  element_op_int_kernel<<<128, 128>>>(
      detail::cuda_arg<value_type>(vec1),
      static_cast<unsigned int>(viennacl::traits::start(vec1)), static_cast<unsigned int>(viennacl::traits::stride(vec1)), static_cast<unsigned int>(viennacl::traits::size(vec1)),
      detail::cuda_arg<value_type>(proxy.lhs()),
      static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())),
      detail::cuda_arg<value_type>(proxy.rhs()),
      static_cast<unsigned int>(viennacl::traits::start(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::stride(proxy.rhs())),
      op_type);
  VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_int_kernel");
}

// Overload for float, using the floating-point kernel (supports pow):
template <typename OP>
void element_op(vector_base<float> & vec1,
                vector_expression<const vector_base<float>, const vector_base<float>, op_element_binary<OP> > const & proxy)
{
  typedef float value_type;

  unsigned int op_type = 2; //0: product, 1: division, 2: power
  if (viennacl::is_division<OP>::value)
    op_type = 1;
  else if (viennacl::is_product<OP>::value)
    op_type = 0;

  element_op_kernel<<<128, 128>>>(
      detail::cuda_arg<value_type>(vec1),
      static_cast<unsigned int>(viennacl::traits::start(vec1)), static_cast<unsigned int>(viennacl::traits::stride(vec1)), static_cast<unsigned int>(viennacl::traits::size(vec1)),
      detail::cuda_arg<value_type>(proxy.lhs()),
      static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())),
      detail::cuda_arg<value_type>(proxy.rhs()),
      static_cast<unsigned int>(viennacl::traits::start(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::stride(proxy.rhs())),
      op_type);
  VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_kernel");
}

// Overload for double:
template <typename OP>
void element_op(vector_base<double> & vec1,
                vector_expression<const vector_base<double>, const vector_base<double>, op_element_binary<OP> > const & proxy)
{
  typedef double value_type;

  unsigned int op_type = 2; //0: product, 1: division, 2: power
  if (viennacl::is_division<OP>::value)
    op_type = 1;
  else if (viennacl::is_product<OP>::value)
    op_type = 0;

  element_op_kernel<<<128, 128>>>(
      detail::cuda_arg<value_type>(vec1),
      static_cast<unsigned int>(viennacl::traits::start(vec1)), static_cast<unsigned int>(viennacl::traits::stride(vec1)), static_cast<unsigned int>(viennacl::traits::size(vec1)),
      detail::cuda_arg<value_type>(proxy.lhs()),
      static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())),
      detail::cuda_arg<value_type>(proxy.rhs()),
      static_cast<unsigned int>(viennacl::traits::start(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::stride(proxy.rhs())),
      op_type);
  VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_kernel");
}
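// Usage sketch (illustrative): the binary element_op() overloads above back
// viennacl::linalg::element_prod() and element_div(); the pow branch
// (op_type == 2) is only reachable through the floating-point overloads:
//
//   z = viennacl::linalg::element_prod(x, y);   // z_i = x_i * y_i
//   z = viennacl::linalg::element_div(x, y);    // z_i = x_i / y_i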
///////////////////////// element-wise unary operations /////////////////////////////

template <typename T>
__global__ void vec_element_acos_kernel(T * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                                        T const * vec2, unsigned int start2, unsigned int inc2)
{
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
    vec1[i*inc1+start1] = acos(vec2[i*inc2+start2]);
}

template <typename T>
void element_op(vector_base<T> & vec1,
                vector_expression<const vector_base<T>, const vector_base<T>, op_element_unary<op_acos> > const & proxy)
{
  typedef T value_type;
  vec_element_acos_kernel<<<128, 128>>>(
      detail::cuda_arg<value_type>(vec1),
      static_cast<unsigned int>(viennacl::traits::start(vec1)), static_cast<unsigned int>(viennacl::traits::stride(vec1)), static_cast<unsigned int>(viennacl::traits::size(vec1)),
      detail::cuda_arg<value_type>(proxy.lhs()),
      static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())) );
  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_acos_kernel");
}

template <typename T>
__global__ void vec_element_asin_kernel(T * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                                        T const * vec2, unsigned int start2, unsigned int inc2)
{
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
    vec1[i*inc1+start1] = asin(vec2[i*inc2+start2]);
}

template <typename T>
void element_op(vector_base<T> & vec1,
                vector_expression<const vector_base<T>, const vector_base<T>, op_element_unary<op_asin> > const & proxy)
{
  typedef T value_type;
  vec_element_asin_kernel<<<128, 128>>>(
      detail::cuda_arg<value_type>(vec1),
      static_cast<unsigned int>(viennacl::traits::start(vec1)), static_cast<unsigned int>(viennacl::traits::stride(vec1)), static_cast<unsigned int>(viennacl::traits::size(vec1)),
      detail::cuda_arg<value_type>(proxy.lhs()),
      static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())) );
  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_asin_kernel");
}

template <typename T>
__global__ void vec_element_atan_kernel(T * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                                        T const * vec2, unsigned int start2, unsigned int inc2)
{
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
    vec1[i*inc1+start1] = atan(vec2[i*inc2+start2]);
}

template <typename T>
void element_op(vector_base<T> & vec1,
                vector_expression<const vector_base<T>, const vector_base<T>, op_element_unary<op_atan> > const & proxy)
{
  typedef T value_type;
  vec_element_atan_kernel<<<128, 128>>>(
      detail::cuda_arg<value_type>(vec1),
      static_cast<unsigned int>(viennacl::traits::start(vec1)), static_cast<unsigned int>(viennacl::traits::stride(vec1)), static_cast<unsigned int>(viennacl::traits::size(vec1)),
      detail::cuda_arg<value_type>(proxy.lhs()),
      static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())) );
  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_atan_kernel");
}

template <typename T>
__global__ void vec_element_ceil_kernel(T * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                                        T const * vec2, unsigned int start2, unsigned int inc2)
{
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
    vec1[i*inc1+start1] = ceil(vec2[i*inc2+start2]);
}

template <typename T>
void element_op(vector_base<T> & vec1,
                vector_expression<const vector_base<T>, const vector_base<T>, op_element_unary<op_ceil> > const & proxy)
{
  typedef T value_type;
  vec_element_ceil_kernel<<<128, 128>>>(
      detail::cuda_arg<value_type>(vec1),
      static_cast<unsigned int>(viennacl::traits::start(vec1)), static_cast<unsigned int>(viennacl::traits::stride(vec1)), static_cast<unsigned int>(viennacl::traits::size(vec1)),
      detail::cuda_arg<value_type>(proxy.lhs()),
      static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())) );
  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_ceil_kernel");
}

template <typename T>
__global__ void vec_element_cos_kernel(T * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                                       T const * vec2, unsigned int start2, unsigned int inc2)
{
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
    vec1[i*inc1+start1] = cos(vec2[i*inc2+start2]);
}

template <typename T>
void element_op(vector_base<T> & vec1,
                vector_expression<const vector_base<T>, const vector_base<T>, op_element_unary<op_cos> > const & proxy)
{
  typedef T value_type;
  vec_element_cos_kernel<<<128, 128>>>(
      detail::cuda_arg<value_type>(vec1),
      static_cast<unsigned int>(viennacl::traits::start(vec1)), static_cast<unsigned int>(viennacl::traits::stride(vec1)), static_cast<unsigned int>(viennacl::traits::size(vec1)),
      detail::cuda_arg<value_type>(proxy.lhs()),
      static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())) );
  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_cos_kernel");
}

template <typename T>
__global__ void vec_element_cosh_kernel(T * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                                        T const * vec2, unsigned int start2, unsigned int inc2)
{
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
    vec1[i*inc1+start1] = cosh(vec2[i*inc2+start2]);
}

template <typename T>
void element_op(vector_base<T> & vec1,
                vector_expression<const vector_base<T>, const vector_base<T>, op_element_unary<op_cosh> > const & proxy)
{
  typedef T value_type;
  vec_element_cosh_kernel<<<128, 128>>>(
      detail::cuda_arg<value_type>(vec1),
      static_cast<unsigned int>(viennacl::traits::start(vec1)), static_cast<unsigned int>(viennacl::traits::stride(vec1)), static_cast<unsigned int>(viennacl::traits::size(vec1)),
      detail::cuda_arg<value_type>(proxy.lhs()),
      static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())) );
  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_cosh_kernel");
}

template <typename T>
__global__ void vec_element_exp_kernel(T * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                                       T const * vec2, unsigned int start2, unsigned int inc2)
{
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
    vec1[i*inc1+start1] = exp(vec2[i*inc2+start2]);
}

template <typename T>
void element_op(vector_base<T> & vec1,
                vector_expression<const vector_base<T>, const vector_base<T>, op_element_unary<op_exp> > const & proxy)
{
  typedef T value_type;
  vec_element_exp_kernel<<<128, 128>>>(
      detail::cuda_arg<value_type>(vec1),
      static_cast<unsigned int>(viennacl::traits::start(vec1)), static_cast<unsigned int>(viennacl::traits::stride(vec1)), static_cast<unsigned int>(viennacl::traits::size(vec1)),
      detail::cuda_arg<value_type>(proxy.lhs()),
      static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())) );
  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_exp_kernel");
}

template <typename T>
__global__ void vec_element_fabs_kernel(T * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                                        T const * vec2, unsigned int start2, unsigned int inc2)
{
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
    vec1[i*inc1+start1] = fabs(vec2[i*inc2+start2]);
}

template <typename T>
void element_op(vector_base<T> & vec1,
                vector_expression<const vector_base<T>, const vector_base<T>, op_element_unary<op_fabs> > const & proxy)
{
  typedef T value_type;
  vec_element_fabs_kernel<<<128, 128>>>(
      detail::cuda_arg<value_type>(vec1),
      static_cast<unsigned int>(viennacl::traits::start(vec1)), static_cast<unsigned int>(viennacl::traits::stride(vec1)), static_cast<unsigned int>(viennacl::traits::size(vec1)),
      detail::cuda_arg<value_type>(proxy.lhs()),
      static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())) );
  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_fabs_kernel");
}

// Integer counterpart of fabs:
template <typename T>
__global__ void vec_element_abs_kernel(T * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                                       T const * vec2, unsigned int start2, unsigned int inc2)
{
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
    vec1[i*inc1+start1] = abs(vec2[i*inc2+start2]);
}

template <typename T>
void element_op(vector_base<T> & vec1,
                vector_expression<const vector_base<T>, const vector_base<T>, op_element_unary<op_abs> > const & proxy)
{
  typedef T value_type;
  vec_element_abs_kernel<<<128, 128>>>(
      detail::cuda_arg<value_type>(vec1),
      static_cast<unsigned int>(viennacl::traits::start(vec1)), static_cast<unsigned int>(viennacl::traits::stride(vec1)), static_cast<unsigned int>(viennacl::traits::size(vec1)),
      detail::cuda_arg<value_type>(proxy.lhs()),
      static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())) );
  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_abs_kernel");
}

template <typename T>
__global__ void vec_element_floor_kernel(T * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                                         T const * vec2, unsigned int start2, unsigned int inc2)
{
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
    vec1[i*inc1+start1] = floor(vec2[i*inc2+start2]);
}

template <typename T>
void element_op(vector_base<T> & vec1,
                vector_expression<const vector_base<T>, const vector_base<T>, op_element_unary<op_floor> > const & proxy)
{
  typedef T value_type;
  vec_element_floor_kernel<<<128, 128>>>(
      detail::cuda_arg<value_type>(vec1),
      static_cast<unsigned int>(viennacl::traits::start(vec1)), static_cast<unsigned int>(viennacl::traits::stride(vec1)), static_cast<unsigned int>(viennacl::traits::size(vec1)),
      detail::cuda_arg<value_type>(proxy.lhs()),
      static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())) );
  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_floor_kernel");
}

template <typename T>
__global__ void vec_element_log_kernel(T * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                                       T const * vec2, unsigned int start2, unsigned int inc2)
{
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
    vec1[i*inc1+start1] = log(vec2[i*inc2+start2]);
}

template <typename T>
void element_op(vector_base<T> & vec1,
                vector_expression<const vector_base<T>, const vector_base<T>, op_element_unary<op_log> > const & proxy)
{
  typedef T value_type;
  vec_element_log_kernel<<<128, 128>>>(
      detail::cuda_arg<value_type>(vec1),
      static_cast<unsigned int>(viennacl::traits::start(vec1)), static_cast<unsigned int>(viennacl::traits::stride(vec1)), static_cast<unsigned int>(viennacl::traits::size(vec1)),
      detail::cuda_arg<value_type>(proxy.lhs()),
      static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())) );
  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_log_kernel");
}

template <typename T>
__global__ void vec_element_log10_kernel(T * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                                         T const * vec2, unsigned int start2, unsigned int inc2)
{
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
    vec1[i*inc1+start1] = log10(vec2[i*inc2+start2]);
}

template <typename T>
void element_op(vector_base<T> & vec1,
                vector_expression<const vector_base<T>, const vector_base<T>, op_element_unary<op_log10> > const & proxy)
{
  typedef T value_type;
  vec_element_log10_kernel<<<128, 128>>>(
      detail::cuda_arg<value_type>(vec1),
      static_cast<unsigned int>(viennacl::traits::start(vec1)), static_cast<unsigned int>(viennacl::traits::stride(vec1)), static_cast<unsigned int>(viennacl::traits::size(vec1)),
      detail::cuda_arg<value_type>(proxy.lhs()),
      static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())) );
  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_log10_kernel");
}

template <typename T>
__global__ void vec_element_sin_kernel(T * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                                       T const * vec2, unsigned int start2, unsigned int inc2)
{
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
    vec1[i*inc1+start1] = sin(vec2[i*inc2+start2]);
}

template <typename T>
void element_op(vector_base<T> & vec1,
                vector_expression<const vector_base<T>, const vector_base<T>, op_element_unary<op_sin> > const & proxy)
{
  typedef T value_type;
  vec_element_sin_kernel<<<128, 128>>>(
      detail::cuda_arg<value_type>(vec1),
      static_cast<unsigned int>(viennacl::traits::start(vec1)), static_cast<unsigned int>(viennacl::traits::stride(vec1)), static_cast<unsigned int>(viennacl::traits::size(vec1)),
      detail::cuda_arg<value_type>(proxy.lhs()),
      static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())) );
  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_sin_kernel");
}

template <typename T>
__global__ void vec_element_sinh_kernel(T * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                                        T const * vec2, unsigned int start2, unsigned int inc2)
{
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
    vec1[i*inc1+start1] = sinh(vec2[i*inc2+start2]);
}

template <typename T>
void element_op(vector_base<T> & vec1,
                vector_expression<const vector_base<T>, const vector_base<T>, op_element_unary<op_sinh> > const & proxy)
{
  typedef T value_type;
  vec_element_sinh_kernel<<<128, 128>>>(
      detail::cuda_arg<value_type>(vec1),
      static_cast<unsigned int>(viennacl::traits::start(vec1)), static_cast<unsigned int>(viennacl::traits::stride(vec1)), static_cast<unsigned int>(viennacl::traits::size(vec1)),
      detail::cuda_arg<value_type>(proxy.lhs()),
      static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())) );
  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_sinh_kernel");
}

template <typename T>
__global__ void vec_element_sqrt_kernel(T * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                                        T const * vec2, unsigned int start2, unsigned int inc2)
{
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
    vec1[i*inc1+start1] = sqrt(vec2[i*inc2+start2]);
}

template <typename T>
void element_op(vector_base<T> & vec1,
                vector_expression<const vector_base<T>, const vector_base<T>, op_element_unary<op_sqrt> > const & proxy)
{
  typedef T value_type;
  vec_element_sqrt_kernel<<<128, 128>>>(
      detail::cuda_arg<value_type>(vec1),
      static_cast<unsigned int>(viennacl::traits::start(vec1)), static_cast<unsigned int>(viennacl::traits::stride(vec1)), static_cast<unsigned int>(viennacl::traits::size(vec1)),
      detail::cuda_arg<value_type>(proxy.lhs()),
      static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())) );
  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_sqrt_kernel");
}

template <typename T>
__global__ void vec_element_tan_kernel(T * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                                       T const * vec2, unsigned int start2, unsigned int inc2)
{
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
    vec1[i*inc1+start1] = tan(vec2[i*inc2+start2]);
}

template <typename T>
void element_op(vector_base<T> & vec1,
                vector_expression<const vector_base<T>, const vector_base<T>, op_element_unary<op_tan> > const & proxy)
{
  typedef T value_type;
  vec_element_tan_kernel<<<128, 128>>>(
      detail::cuda_arg<value_type>(vec1),
      static_cast<unsigned int>(viennacl::traits::start(vec1)), static_cast<unsigned int>(viennacl::traits::stride(vec1)), static_cast<unsigned int>(viennacl::traits::size(vec1)),
      detail::cuda_arg<value_type>(proxy.lhs()),
      static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())) );
  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_tan_kernel");
}

template <typename T>
__global__ void vec_element_tanh_kernel(T * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                                        T const * vec2, unsigned int start2, unsigned int inc2)
{
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
    vec1[i*inc1+start1] = tanh(vec2[i*inc2+start2]);
}

template <typename T>
void element_op(vector_base<T> & vec1,
                vector_expression<const vector_base<T>, const vector_base<T>, op_element_unary<op_tanh> > const & proxy)
{
  typedef T value_type;
  vec_element_tanh_kernel<<<128, 128>>>(
      detail::cuda_arg<value_type>(vec1),
      static_cast<unsigned int>(viennacl::traits::start(vec1)), static_cast<unsigned int>(viennacl::traits::stride(vec1)), static_cast<unsigned int>(viennacl::traits::size(vec1)),
      detail::cuda_arg<value_type>(proxy.lhs()),
      static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())) );
  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_tanh_kernel");
}
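// Usage sketch (illustrative): each unary wrapper above serves one of the
// viennacl::linalg::element_*() convenience functions, e.g.:
//
//   y = viennacl::linalg::element_exp(x);    // y_i = exp(x_i)
//   y = viennacl::linalg::element_sqrt(x);   // y_i = sqrt(x_i)
//   y = viennacl::linalg::element_fabs(x);   // y_i = |x_i|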
///////////////////////// inner products /////////////////////////////

template <typename T>
__global__ void inner_prod_kernel(const T * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                                  const T * vec2, unsigned int start2, unsigned int inc2, unsigned int size2,
                                  T * group_buffer)
{
  __shared__ T tmp_buffer[128];
  unsigned int group_start1 = (blockIdx.x * size1) / (gridDim.x) * inc1 + start1;
  unsigned int group_start2 = (blockIdx.x * size2) / (gridDim.x) * inc2 + start2;

  unsigned int group_size1 = ((blockIdx.x + 1) * size1) / (gridDim.x)
                               - ( blockIdx.x * size1) / (gridDim.x);

  T tmp = 0;
  for (unsigned int i = threadIdx.x; i < group_size1; i += blockDim.x)
    tmp += vec1[i*inc1+group_start1] * vec2[i*inc2+group_start2];
  tmp_buffer[threadIdx.x] = tmp;

  // parallel reduction in shared memory
  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
  {
    __syncthreads();
    if (threadIdx.x < stride)
      tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x+stride];
  }

  if (threadIdx.x == 0)
    group_buffer[blockIdx.x] = tmp_buffer[0];
}
template <typename T>
__global__ void vector_sum_kernel_floats(const T * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                                         unsigned int option, //0: use max, 1: just sum, 2: sum and return sqrt of sum
                                         T * result)
{
  __shared__ T tmp_buffer[128];
  T thread_sum = 0;
  for (unsigned int i = threadIdx.x; i < size1; i += blockDim.x)
  {
    if (option > 0)
      thread_sum += vec1[i*inc1+start1];
    else
      thread_sum = fmax(thread_sum, fabs(vec1[i*inc1+start1]));
  }

  tmp_buffer[threadIdx.x] = thread_sum;

  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
  {
    __syncthreads();
    if (threadIdx.x < stride)
    {
      if (option > 0)
        tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x + stride];
      else
        tmp_buffer[threadIdx.x] = fmax(tmp_buffer[threadIdx.x], tmp_buffer[threadIdx.x + stride]);
    }
  }

  if (threadIdx.x == 0)
  {
    if (option == 2)
      *result = sqrt(tmp_buffer[0]);
    else
      *result = tmp_buffer[0];
  }
}

template <typename T>
__global__ void vector_sum_kernel_integers(const T * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                                           unsigned int option, //0: use max, 1: just sum
                                           T * result)
{
  __shared__ T tmp_buffer[128];
  T thread_sum = 0;
  for (unsigned int i = threadIdx.x; i < size1; i += blockDim.x)
  {
    if (option > 0)
      thread_sum += vec1[i*inc1+start1];
    else
      thread_sum = thread_sum > abs(vec1[i*inc1+start1]) ? thread_sum : abs(vec1[i*inc1+start1]);
  }

  tmp_buffer[threadIdx.x] = thread_sum;

  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
  {
    __syncthreads();
    if (threadIdx.x < stride)
    {
      if (option > 0)
        tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x + stride];
      else
        tmp_buffer[threadIdx.x] = tmp_buffer[threadIdx.x] > tmp_buffer[threadIdx.x + stride] ? tmp_buffer[threadIdx.x] : tmp_buffer[threadIdx.x + stride];
    }
  }

  if (threadIdx.x == 0)
    *result = tmp_buffer[0];
}

template <typename T>
__global__ void vector_sum_kernel_unsigned_integers(const T * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                                                    unsigned int option, //0: use max, 1: just sum
                                                    T * result)
{
  __shared__ T tmp_buffer[128];
  T thread_sum = 0;
  for (unsigned int i = threadIdx.x; i < size1; i += blockDim.x)
  {
    if (option > 0)
      thread_sum += vec1[i*inc1+start1];
    else
      thread_sum = (thread_sum > vec1[i*inc1+start1]) ? thread_sum : vec1[i*inc1+start1];
  }

  tmp_buffer[threadIdx.x] = thread_sum;

  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
  {
    __syncthreads();
    if (threadIdx.x < stride)
    {
      if (option > 0)
        tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x + stride];
      else
        tmp_buffer[threadIdx.x] = tmp_buffer[threadIdx.x] > tmp_buffer[threadIdx.x + stride] ? tmp_buffer[threadIdx.x] : tmp_buffer[threadIdx.x + stride];
    }
  }

  if (threadIdx.x == 0)
    *result = tmp_buffer[0];
}
namespace detail
{
  struct vector_sum_kernel_launcher_integers
  {
    template <typename T, typename S3>
    static void apply(vector_base<T> const & temp,
                      unsigned int option,
                      S3 & result)
    {
      typedef T value_type;
      vector_sum_kernel_integers<<<1, 128>>>(
          detail::cuda_arg<value_type>(temp),
          static_cast<unsigned int>(viennacl::traits::start(temp)), static_cast<unsigned int>(viennacl::traits::stride(temp)), static_cast<unsigned int>(viennacl::traits::size(temp)),
          static_cast<unsigned int>(option),
          detail::cuda_arg<value_type>(result) );
      VIENNACL_CUDA_LAST_ERROR_CHECK("vector_sum_kernel_integers");
    }
  };

  struct vector_sum_kernel_launcher_unsigned_integers
  {
    template <typename T, typename S3>
    static void apply(vector_base<T> const & temp,
                      unsigned int option,
                      S3 & result)
    {
      typedef T value_type;
      vector_sum_kernel_unsigned_integers<<<1, 128>>>(
          detail::cuda_arg<value_type>(temp),
          static_cast<unsigned int>(viennacl::traits::start(temp)), static_cast<unsigned int>(viennacl::traits::stride(temp)), static_cast<unsigned int>(viennacl::traits::size(temp)),
          static_cast<unsigned int>(option),
          detail::cuda_arg<value_type>(result) );
      VIENNACL_CUDA_LAST_ERROR_CHECK("vector_sum_kernel_unsigned_integers");
    }
  };

  struct vector_sum_kernel_launcher_floats
  {
    template <typename T, typename S3>
    static void apply(vector_base<T> const & temp,
                      unsigned int option,
                      S3 & result)
    {
      typedef T value_type;
      vector_sum_kernel_floats<<<1, 128>>>(
          detail::cuda_arg<value_type>(temp),
          static_cast<unsigned int>(viennacl::traits::start(temp)), static_cast<unsigned int>(viennacl::traits::stride(temp)), static_cast<unsigned int>(viennacl::traits::size(temp)),
          static_cast<unsigned int>(option),
          detail::cuda_arg<value_type>(result) );
      VIENNACL_CUDA_LAST_ERROR_CHECK("vector_sum_kernel_floats");
    }
  };

  template <typename T>
  struct vector_sum_kernel_launcher : public vector_sum_kernel_launcher_integers {};

  template <>
  struct vector_sum_kernel_launcher<unsigned char>  : public vector_sum_kernel_launcher_unsigned_integers {};

  template <>
  struct vector_sum_kernel_launcher<unsigned short> : public vector_sum_kernel_launcher_unsigned_integers {};

  template <>
  struct vector_sum_kernel_launcher<unsigned int>   : public vector_sum_kernel_launcher_unsigned_integers {};

  template <>
  struct vector_sum_kernel_launcher<unsigned long>  : public vector_sum_kernel_launcher_unsigned_integers {};

  template <>
  struct vector_sum_kernel_launcher<float>  : public vector_sum_kernel_launcher_floats {};

  template <>
  struct vector_sum_kernel_launcher<double> : public vector_sum_kernel_launcher_floats {};
} //namespace detail
/** @brief Computes the inner product of two vectors - implementation. Library users should call inner_prod(vec1, vec2) instead. */
template <typename T, typename S3>
void inner_prod_impl(vector_base<T> const & vec1,
                     vector_base<T> const & vec2,
                     S3 & result)
{
  typedef T value_type;

  static const unsigned int work_groups = 128;
  static viennacl::vector<value_type> temp(work_groups);

  inner_prod_kernel<<<128, 128>>>(
      detail::cuda_arg<value_type>(vec1),
      static_cast<unsigned int>(viennacl::traits::start(vec1)), static_cast<unsigned int>(viennacl::traits::stride(vec1)), static_cast<unsigned int>(viennacl::traits::size(vec1)),
      detail::cuda_arg<value_type>(vec2),
      static_cast<unsigned int>(viennacl::traits::start(vec2)), static_cast<unsigned int>(viennacl::traits::stride(vec2)), static_cast<unsigned int>(viennacl::traits::size(vec2)),
      detail::cuda_arg<value_type>(temp) );
  VIENNACL_CUDA_LAST_ERROR_CHECK("inner_prod_kernel");

  detail::vector_sum_kernel_launcher<T>::apply(temp, 1, result);
}
/** @brief Computes the inner product of two vectors with the final reduction on the CPU. */
template <typename T>
void inner_prod_cpu(vector_base<T> const & vec1,
                    vector_base<T> const & vec2,
                    T & result)
{
  typedef T value_type;

  const unsigned int work_groups = 128;
  viennacl::vector<value_type> temp(work_groups);

  inner_prod_kernel<<<128, 128>>>(
      detail::cuda_arg<value_type>(vec1),
      static_cast<unsigned int>(viennacl::traits::start(vec1)), static_cast<unsigned int>(viennacl::traits::stride(vec1)), static_cast<unsigned int>(viennacl::traits::size(vec1)),
      detail::cuda_arg<value_type>(vec2),
      static_cast<unsigned int>(viennacl::traits::start(vec2)), static_cast<unsigned int>(viennacl::traits::stride(vec2)), static_cast<unsigned int>(viennacl::traits::size(vec2)),
      detail::cuda_arg<value_type>(temp) );
  VIENNACL_CUDA_LAST_ERROR_CHECK("inner_prod_kernel");

  // Copy the partial results back to main RAM and sum them there:
  std::vector<value_type> temp_cpu(work_groups);
  viennacl::fast_copy(temp.begin(), temp.end(), temp_cpu.begin());

  result = 0;
  for (typename std::vector<value_type>::const_iterator it = temp_cpu.begin(); it != temp_cpu.end(); ++it)
    result += *it;
}
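// Usage sketch (illustrative): inner_prod_impl() keeps the result on the
// device (viennacl::scalar<T>), inner_prod_cpu() reduces the 128 partial sums
// on the host. Both back viennacl::linalg::inner_prod():
//
//   viennacl::scalar<float> s = viennacl::linalg::inner_prod(x, y); // result stays on the GPU
//   float s_host = viennacl::linalg::inner_prod(x, y);              // result copied to the host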
#define VIENNACL_MDOT_WORKGROUP_SIZE  128
#define VIENNACL_MDOT_WORKGROUP_NUM   128
// M = 2:
template <typename NumericT>
__global__ void inner_prod_2_kernel(const NumericT *x, unsigned int startx, unsigned int stridex, unsigned int sizex,
                                    const NumericT *y0, unsigned int start0, unsigned int stride0,
                                    const NumericT *y1, unsigned int start1, unsigned int stride1,
                                    NumericT *group_results)
{
  __shared__ NumericT tmp_buffer[2*VIENNACL_MDOT_WORKGROUP_SIZE];
  unsigned int entries_per_thread = (sizex - 1) / (blockDim.x * gridDim.x) + 1;
  unsigned int vec_start_index = blockIdx.x * blockDim.x * entries_per_thread;
  unsigned int vec_stop_index  = min((blockIdx.x + 1) * blockDim.x * entries_per_thread, sizex); // don't go beyond the size of x

  NumericT entry_x    = 0;
  NumericT group_sum0 = 0;
  NumericT group_sum1 = 0;
  for (unsigned int i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
    entry_x     = x[i * stridex + startx];   // load only once from global memory!
    group_sum0 += entry_x * y0[i * stride0 + start0];
    group_sum1 += entry_x * y1[i * stride1 + start1];
  }
  tmp_buffer[threadIdx.x]              = group_sum0;
  tmp_buffer[threadIdx.x + blockDim.x] = group_sum1;

  // parallel reduction
  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2) {
    __syncthreads();
    if (threadIdx.x < stride) {
      tmp_buffer[threadIdx.x             ] += tmp_buffer[threadIdx.x+stride             ];
      tmp_buffer[threadIdx.x + blockDim.x] += tmp_buffer[threadIdx.x+stride + blockDim.x];
    }
  }

  // write result of group to group_results
  if (threadIdx.x == 0) {
    group_results[blockIdx.x]             = tmp_buffer[0];
    group_results[blockIdx.x + gridDim.x] = tmp_buffer[blockDim.x];
  }
}

// M = 3:
template <typename NumericT>
__global__ void inner_prod_3_kernel(const NumericT *x, unsigned int startx, unsigned int stridex, unsigned int sizex,
                                    const NumericT *y0, unsigned int start0, unsigned int stride0,
                                    const NumericT *y1, unsigned int start1, unsigned int stride1,
                                    const NumericT *y2, unsigned int start2, unsigned int stride2,
                                    NumericT *group_results)
{
  __shared__ NumericT tmp_buffer[3*VIENNACL_MDOT_WORKGROUP_SIZE];
  unsigned int entries_per_thread = (sizex - 1) / (blockDim.x * gridDim.x) + 1;
  unsigned int vec_start_index = blockIdx.x * blockDim.x * entries_per_thread;
  unsigned int vec_stop_index  = min((blockIdx.x + 1) * blockDim.x * entries_per_thread, sizex);

  NumericT entry_x    = 0;
  NumericT group_sum0 = 0;
  NumericT group_sum1 = 0;
  NumericT group_sum2 = 0;
  for (unsigned int i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
    entry_x     = x[i * stridex + startx];
    group_sum0 += entry_x * y0[i * stride0 + start0];
    group_sum1 += entry_x * y1[i * stride1 + start1];
    group_sum2 += entry_x * y2[i * stride2 + start2];
  }
  tmp_buffer[threadIdx.x]                  = group_sum0;
  tmp_buffer[threadIdx.x +     blockDim.x] = group_sum1;
  tmp_buffer[threadIdx.x + 2 * blockDim.x] = group_sum2;

  // parallel reduction
  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2) {
    __syncthreads();
    if (threadIdx.x < stride) {
      tmp_buffer[threadIdx.x                 ] += tmp_buffer[threadIdx.x+stride                 ];
      tmp_buffer[threadIdx.x +     blockDim.x] += tmp_buffer[threadIdx.x+stride +     blockDim.x];
      tmp_buffer[threadIdx.x + 2 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 2 * blockDim.x];
    }
  }

  if (threadIdx.x == 0) {
    group_results[blockIdx.x                ] = tmp_buffer[0];
    group_results[blockIdx.x +     gridDim.x] = tmp_buffer[    blockDim.x];
    group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * blockDim.x];
  }
}

// M = 4:
template <typename NumericT>
__global__ void inner_prod_4_kernel(const NumericT *x, unsigned int startx, unsigned int stridex, unsigned int sizex,
                                    const NumericT *y0, unsigned int start0, unsigned int stride0,
                                    const NumericT *y1, unsigned int start1, unsigned int stride1,
                                    const NumericT *y2, unsigned int start2, unsigned int stride2,
                                    const NumericT *y3, unsigned int start3, unsigned int stride3,
                                    NumericT *group_results)
{
  __shared__ NumericT tmp_buffer[4*VIENNACL_MDOT_WORKGROUP_SIZE];
  unsigned int entries_per_thread = (sizex - 1) / (blockDim.x * gridDim.x) + 1;
  unsigned int vec_start_index = blockIdx.x * blockDim.x * entries_per_thread;
  unsigned int vec_stop_index  = min((blockIdx.x + 1) * blockDim.x * entries_per_thread, sizex);

  NumericT entry_x    = 0;
  NumericT group_sum0 = 0;
  NumericT group_sum1 = 0;
  NumericT group_sum2 = 0;
  NumericT group_sum3 = 0;
  for (unsigned int i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
    entry_x     = x[i * stridex + startx];
    group_sum0 += entry_x * y0[i * stride0 + start0];
    group_sum1 += entry_x * y1[i * stride1 + start1];
    group_sum2 += entry_x * y2[i * stride2 + start2];
    group_sum3 += entry_x * y3[i * stride3 + start3];
  }
  tmp_buffer[threadIdx.x]                  = group_sum0;
  tmp_buffer[threadIdx.x +     blockDim.x] = group_sum1;
  tmp_buffer[threadIdx.x + 2 * blockDim.x] = group_sum2;
  tmp_buffer[threadIdx.x + 3 * blockDim.x] = group_sum3;

  // parallel reduction
  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2) {
    __syncthreads();
    if (threadIdx.x < stride) {
      tmp_buffer[threadIdx.x                 ] += tmp_buffer[threadIdx.x+stride                 ];
      tmp_buffer[threadIdx.x +     blockDim.x] += tmp_buffer[threadIdx.x+stride +     blockDim.x];
      tmp_buffer[threadIdx.x + 2 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 2 * blockDim.x];
      tmp_buffer[threadIdx.x + 3 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 3 * blockDim.x];
    }
  }

  if (threadIdx.x == 0) {
    group_results[blockIdx.x                ] = tmp_buffer[0];
    group_results[blockIdx.x +     gridDim.x] = tmp_buffer[    blockDim.x];
    group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * blockDim.x];
    group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * blockDim.x];
  }
}

// M = 8:
template <typename NumericT>
__global__ void inner_prod_8_kernel(const NumericT *x, unsigned int startx, unsigned int stridex, unsigned int sizex,
                                    const NumericT *y0, unsigned int start0, unsigned int stride0,
                                    const NumericT *y1, unsigned int start1, unsigned int stride1,
                                    const NumericT *y2, unsigned int start2, unsigned int stride2,
                                    const NumericT *y3, unsigned int start3, unsigned int stride3,
                                    const NumericT *y4, unsigned int start4, unsigned int stride4,
                                    const NumericT *y5, unsigned int start5, unsigned int stride5,
                                    const NumericT *y6, unsigned int start6, unsigned int stride6,
                                    const NumericT *y7, unsigned int start7, unsigned int stride7,
                                    NumericT *group_results)
{
  __shared__ NumericT tmp_buffer[8*VIENNACL_MDOT_WORKGROUP_SIZE];
  unsigned int entries_per_thread = (sizex - 1) / (blockDim.x * gridDim.x) + 1;
  unsigned int vec_start_index = blockIdx.x * blockDim.x * entries_per_thread;
  unsigned int vec_stop_index  = min((blockIdx.x + 1) * blockDim.x * entries_per_thread, sizex);

  NumericT entry_x    = 0;
  NumericT group_sum0 = 0;
  NumericT group_sum1 = 0;
  NumericT group_sum2 = 0;
  NumericT group_sum3 = 0;
  NumericT group_sum4 = 0;
  NumericT group_sum5 = 0;
  NumericT group_sum6 = 0;
  NumericT group_sum7 = 0;
  for (unsigned int i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
    entry_x     = x[i * stridex + startx];
    group_sum0 += entry_x * y0[i * stride0 + start0];
    group_sum1 += entry_x * y1[i * stride1 + start1];
    group_sum2 += entry_x * y2[i * stride2 + start2];
    group_sum3 += entry_x * y3[i * stride3 + start3];
    group_sum4 += entry_x * y4[i * stride4 + start4];
    group_sum5 += entry_x * y5[i * stride5 + start5];
    group_sum6 += entry_x * y6[i * stride6 + start6];
    group_sum7 += entry_x * y7[i * stride7 + start7];
  }
  tmp_buffer[threadIdx.x]                  = group_sum0;
  tmp_buffer[threadIdx.x +     blockDim.x] = group_sum1;
  tmp_buffer[threadIdx.x + 2 * blockDim.x] = group_sum2;
  tmp_buffer[threadIdx.x + 3 * blockDim.x] = group_sum3;
  tmp_buffer[threadIdx.x + 4 * blockDim.x] = group_sum4;
  tmp_buffer[threadIdx.x + 5 * blockDim.x] = group_sum5;
  tmp_buffer[threadIdx.x + 6 * blockDim.x] = group_sum6;
  tmp_buffer[threadIdx.x + 7 * blockDim.x] = group_sum7;

  // parallel reduction
  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2) {
    __syncthreads();
    if (threadIdx.x < stride) {
      tmp_buffer[threadIdx.x                 ] += tmp_buffer[threadIdx.x+stride                 ];
      tmp_buffer[threadIdx.x +     blockDim.x] += tmp_buffer[threadIdx.x+stride +     blockDim.x];
      tmp_buffer[threadIdx.x + 2 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 2 * blockDim.x];
      tmp_buffer[threadIdx.x + 3 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 3 * blockDim.x];
      tmp_buffer[threadIdx.x + 4 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 4 * blockDim.x];
      tmp_buffer[threadIdx.x + 5 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 5 * blockDim.x];
      tmp_buffer[threadIdx.x + 6 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 6 * blockDim.x];
      tmp_buffer[threadIdx.x + 7 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 7 * blockDim.x];
    }
  }

  if (threadIdx.x == 0) {
    group_results[blockIdx.x                ] = tmp_buffer[0];
    group_results[blockIdx.x +     gridDim.x] = tmp_buffer[    blockDim.x];
    group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * blockDim.x];
    group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * blockDim.x];
    group_results[blockIdx.x + 4 * gridDim.x] = tmp_buffer[4 * blockDim.x];
    group_results[blockIdx.x + 5 * gridDim.x] = tmp_buffer[5 * blockDim.x];
    group_results[blockIdx.x + 6 * gridDim.x] = tmp_buffer[6 * blockDim.x];
    group_results[blockIdx.x + 7 * gridDim.x] = tmp_buffer[7 * blockDim.x];
  }
}
// Sums the partial results of each inner product; one block per result entry.
template <typename T>
__global__ void vector_multi_sum_kernel(T const * vec1,
                                        T * result,
                                        unsigned int start_result,
                                        unsigned int inc_result)
{
  __shared__ T tmp_buffer[VIENNACL_MDOT_WORKGROUP_SIZE];

  tmp_buffer[threadIdx.x] = vec1[threadIdx.x + blockIdx.x * blockDim.x];

  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
  {
    __syncthreads();
    if (threadIdx.x < stride)
      tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x + stride];
  }

  if (threadIdx.x == 0)
    result[start_result + inc_result * blockIdx.x] = tmp_buffer[0];
}
template <typename T>
void inner_prod_impl(vector_base<T> const & x,
                     vector_tuple<T> const & vec_tuple,
                     vector_base<T> & result)
{
  typedef T value_type;

  viennacl::vector<value_type> temp(8 * VIENNACL_MDOT_WORKGROUP_NUM);

  unsigned int start_x  = static_cast<unsigned int>(viennacl::traits::start(x));
  unsigned int stride_x = static_cast<unsigned int>(viennacl::traits::stride(x));
  unsigned int size_x   = static_cast<unsigned int>(viennacl::traits::size(x));

  unsigned int start_result  = static_cast<unsigned int>(viennacl::traits::start(result));
  unsigned int stride_result = static_cast<unsigned int>(viennacl::traits::stride(result));

  vcl_size_t current_index = 0;
  while (vec_tuple.const_size() > current_index)
  {
    switch (vec_tuple.const_size() - current_index)
    {
      case 7:
      case 6:
      case 5:
      case 4:
      {
        vector_base<T> const & y0 = vec_tuple.const_at(current_index    );
        vector_base<T> const & y1 = vec_tuple.const_at(current_index + 1);
        vector_base<T> const & y2 = vec_tuple.const_at(current_index + 2);
        vector_base<T> const & y3 = vec_tuple.const_at(current_index + 3);
        inner_prod_4_kernel<<<VIENNACL_MDOT_WORKGROUP_NUM, VIENNACL_MDOT_WORKGROUP_SIZE>>>(
            detail::cuda_arg<value_type>(x), start_x, stride_x, size_x,
            detail::cuda_arg<value_type>(y0), static_cast<unsigned int>(viennacl::traits::start(y0)), static_cast<unsigned int>(viennacl::traits::stride(y0)),
            detail::cuda_arg<value_type>(y1), static_cast<unsigned int>(viennacl::traits::start(y1)), static_cast<unsigned int>(viennacl::traits::stride(y1)),
            detail::cuda_arg<value_type>(y2), static_cast<unsigned int>(viennacl::traits::start(y2)), static_cast<unsigned int>(viennacl::traits::stride(y2)),
            detail::cuda_arg<value_type>(y3), static_cast<unsigned int>(viennacl::traits::start(y3)), static_cast<unsigned int>(viennacl::traits::stride(y3)),
            detail::cuda_arg<value_type>(temp) );
        VIENNACL_CUDA_LAST_ERROR_CHECK("inner_prod_4_kernel");
        vector_multi_sum_kernel<<<4, VIENNACL_MDOT_WORKGROUP_NUM>>>(detail::cuda_arg<value_type>(temp),
                                                                    detail::cuda_arg<value_type>(result),
                                                                    start_result + static_cast<unsigned int>(current_index) * stride_result,
                                                                    stride_result);
        VIENNACL_CUDA_LAST_ERROR_CHECK("vector_multi_sum_kernel");
        current_index += 4;
        break;
      }

      case 3:
      {
        vector_base<T> const & y0 = vec_tuple.const_at(current_index    );
        vector_base<T> const & y1 = vec_tuple.const_at(current_index + 1);
        vector_base<T> const & y2 = vec_tuple.const_at(current_index + 2);
        inner_prod_3_kernel<<<VIENNACL_MDOT_WORKGROUP_NUM, VIENNACL_MDOT_WORKGROUP_SIZE>>>(
            detail::cuda_arg<value_type>(x), start_x, stride_x, size_x,
            detail::cuda_arg<value_type>(y0), static_cast<unsigned int>(viennacl::traits::start(y0)), static_cast<unsigned int>(viennacl::traits::stride(y0)),
            detail::cuda_arg<value_type>(y1), static_cast<unsigned int>(viennacl::traits::start(y1)), static_cast<unsigned int>(viennacl::traits::stride(y1)),
            detail::cuda_arg<value_type>(y2), static_cast<unsigned int>(viennacl::traits::start(y2)), static_cast<unsigned int>(viennacl::traits::stride(y2)),
            detail::cuda_arg<value_type>(temp) );
        VIENNACL_CUDA_LAST_ERROR_CHECK("inner_prod_3_kernel");
        vector_multi_sum_kernel<<<3, VIENNACL_MDOT_WORKGROUP_NUM>>>(detail::cuda_arg<value_type>(temp),
                                                                    detail::cuda_arg<value_type>(result),
                                                                    start_result + static_cast<unsigned int>(current_index) * stride_result,
                                                                    stride_result);
        VIENNACL_CUDA_LAST_ERROR_CHECK("vector_multi_sum_kernel");
        current_index += 3;
        break;
      }

      case 2:
      {
        vector_base<T> const & y0 = vec_tuple.const_at(current_index    );
        vector_base<T> const & y1 = vec_tuple.const_at(current_index + 1);
        inner_prod_2_kernel<<<VIENNACL_MDOT_WORKGROUP_NUM, VIENNACL_MDOT_WORKGROUP_SIZE>>>(
            detail::cuda_arg<value_type>(x), start_x, stride_x, size_x,
            detail::cuda_arg<value_type>(y0), static_cast<unsigned int>(viennacl::traits::start(y0)), static_cast<unsigned int>(viennacl::traits::stride(y0)),
            detail::cuda_arg<value_type>(y1), static_cast<unsigned int>(viennacl::traits::start(y1)), static_cast<unsigned int>(viennacl::traits::stride(y1)),
            detail::cuda_arg<value_type>(temp) );
        VIENNACL_CUDA_LAST_ERROR_CHECK("inner_prod_2_kernel");
        vector_multi_sum_kernel<<<2, VIENNACL_MDOT_WORKGROUP_NUM>>>(detail::cuda_arg<value_type>(temp),
                                                                    detail::cuda_arg<value_type>(result),
                                                                    start_result + static_cast<unsigned int>(current_index) * stride_result,
                                                                    stride_result);
        VIENNACL_CUDA_LAST_ERROR_CHECK("vector_multi_sum_kernel");
        current_index += 2;
        break;
      }

      case 1:
      {
        vector_base<T> const & y0 = vec_tuple.const_at(current_index);
        inner_prod_kernel<<<128, 128>>>(
            detail::cuda_arg<value_type>(x), start_x, stride_x, size_x,
            detail::cuda_arg<value_type>(y0), static_cast<unsigned int>(viennacl::traits::start(y0)), static_cast<unsigned int>(viennacl::traits::stride(y0)), static_cast<unsigned int>(viennacl::traits::size(y0)),
            detail::cuda_arg<value_type>(temp) );
        VIENNACL_CUDA_LAST_ERROR_CHECK("inner_prod_kernel");
        vector_multi_sum_kernel<<<1, 128>>>(detail::cuda_arg<value_type>(temp),
                                            detail::cuda_arg<value_type>(result),
                                            start_result + static_cast<unsigned int>(current_index) * stride_result,
                                            stride_result);
        VIENNACL_CUDA_LAST_ERROR_CHECK("vector_multi_sum_kernel");
        current_index += 1;
        break;
      }

      default: // eight or more vectors remaining
      {
        vector_base<T> const & y0 = vec_tuple.const_at(current_index    );
        vector_base<T> const & y1 = vec_tuple.const_at(current_index + 1);
        vector_base<T> const & y2 = vec_tuple.const_at(current_index + 2);
        vector_base<T> const & y3 = vec_tuple.const_at(current_index + 3);
        vector_base<T> const & y4 = vec_tuple.const_at(current_index + 4);
        vector_base<T> const & y5 = vec_tuple.const_at(current_index + 5);
        vector_base<T> const & y6 = vec_tuple.const_at(current_index + 6);
        vector_base<T> const & y7 = vec_tuple.const_at(current_index + 7);
        inner_prod_8_kernel<<<VIENNACL_MDOT_WORKGROUP_NUM, VIENNACL_MDOT_WORKGROUP_SIZE>>>(
            detail::cuda_arg<value_type>(x), start_x, stride_x, size_x,
            detail::cuda_arg<value_type>(y0), static_cast<unsigned int>(viennacl::traits::start(y0)), static_cast<unsigned int>(viennacl::traits::stride(y0)),
            detail::cuda_arg<value_type>(y1), static_cast<unsigned int>(viennacl::traits::start(y1)), static_cast<unsigned int>(viennacl::traits::stride(y1)),
            detail::cuda_arg<value_type>(y2), static_cast<unsigned int>(viennacl::traits::start(y2)), static_cast<unsigned int>(viennacl::traits::stride(y2)),
            detail::cuda_arg<value_type>(y3), static_cast<unsigned int>(viennacl::traits::start(y3)), static_cast<unsigned int>(viennacl::traits::stride(y3)),
            detail::cuda_arg<value_type>(y4), static_cast<unsigned int>(viennacl::traits::start(y4)), static_cast<unsigned int>(viennacl::traits::stride(y4)),
            detail::cuda_arg<value_type>(y5), static_cast<unsigned int>(viennacl::traits::start(y5)), static_cast<unsigned int>(viennacl::traits::stride(y5)),
            detail::cuda_arg<value_type>(y6), static_cast<unsigned int>(viennacl::traits::start(y6)), static_cast<unsigned int>(viennacl::traits::stride(y6)),
            detail::cuda_arg<value_type>(y7), static_cast<unsigned int>(viennacl::traits::start(y7)), static_cast<unsigned int>(viennacl::traits::stride(y7)),
            detail::cuda_arg<value_type>(temp) );
        VIENNACL_CUDA_LAST_ERROR_CHECK("inner_prod_8_kernel");
        vector_multi_sum_kernel<<<8, VIENNACL_MDOT_WORKGROUP_NUM>>>(detail::cuda_arg<value_type>(temp),
                                                                    detail::cuda_arg<value_type>(result),
                                                                    start_result + static_cast<unsigned int>(current_index) * stride_result,
                                                                    stride_result);
        VIENNACL_CUDA_LAST_ERROR_CHECK("vector_multi_sum_kernel");
        current_index += 8;
        break;
      }
    }
  }
}
#undef VIENNACL_MDOT_WORKGROUP_NUM
#undef VIENNACL_MDOT_WORKGROUP_SIZE
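// Usage sketch (illustrative): the vector_tuple overload computes several
// inner products against a common vector x in a single pass over x:
//
//   viennacl::vector<float> results(3);
//   results = viennacl::linalg::inner_prod(x, viennacl::tie(y0, y1, y2));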
///////////////////////// norms /////////////////////////////

template <typename T>
__global__ void norm_kernel_floats(const T * vec, unsigned int start1, unsigned int inc1, unsigned int size1,
                                   unsigned int norm_selector,
                                   T * group_buffer)
{
  __shared__ T tmp_buffer[128];

  T tmp = 0;
  unsigned int work_per_thread = (size1 - 1) / (gridDim.x * blockDim.x) + 1;
  unsigned int group_start = blockIdx.x * work_per_thread * blockDim.x;
  unsigned int group_stop  = (blockIdx.x + 1) * work_per_thread * blockDim.x;
  group_stop = (group_stop > size1) ? size1 : group_stop;

  if (norm_selector == 1) //norm_1
  {
    for (unsigned int i = group_start + threadIdx.x; i < group_stop; i += blockDim.x)
      tmp += fabs(vec[i*inc1 + start1]);
  }
  else if (norm_selector == 2) //norm_2
  {
    T vec_entry = 0;
    for (unsigned int i = group_start + threadIdx.x; i < group_stop; i += blockDim.x)
    {
      vec_entry = vec[i*inc1 + start1];
      tmp += vec_entry * vec_entry;
    }
  }
  else if (norm_selector == 0) //norm_inf
  {
    for (unsigned int i = group_start + threadIdx.x; i < group_stop; i += blockDim.x)
      tmp = fmax(fabs(vec[i*inc1 + start1]), tmp);
  }

  tmp_buffer[threadIdx.x] = tmp;

  if (norm_selector > 0) //parallel reduction: sum
  {
    for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
    {
      __syncthreads();
      if (threadIdx.x < stride)
        tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x+stride];
    }
  }
  else //parallel reduction: max
  {
    for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
    {
      __syncthreads();
      if (threadIdx.x < stride)
        tmp_buffer[threadIdx.x] = fmax(tmp_buffer[threadIdx.x], tmp_buffer[threadIdx.x+stride]);
    }
  }

  if (threadIdx.x == 0)
    group_buffer[blockIdx.x] = tmp_buffer[0];
}

template <typename T>
__global__ void norm_kernel_integers(const T * vec, unsigned int start1, unsigned int inc1, unsigned int size1,
                                     unsigned int norm_selector,
                                     T * group_buffer)
{
  __shared__ T tmp_buffer[128];

  T tmp = 0;
  unsigned int work_per_thread = (size1 - 1) / (gridDim.x * blockDim.x) + 1;
  unsigned int group_start = blockIdx.x * work_per_thread * blockDim.x;
  unsigned int group_stop  = (blockIdx.x + 1) * work_per_thread * blockDim.x;
  group_stop = (group_stop > size1) ? size1 : group_stop;

  if (norm_selector == 1) //norm_1
  {
    for (unsigned int i = group_start + threadIdx.x; i < group_stop; i += blockDim.x)
      tmp += abs(vec[i*inc1 + start1]);
  }
  else if (norm_selector == 0) //norm_inf
  {
    for (unsigned int i = group_start + threadIdx.x; i < group_stop; i += blockDim.x)
      tmp = (tmp > abs(vec[i*inc1 + start1])) ? tmp : abs(vec[i*inc1 + start1]);
  }

  tmp_buffer[threadIdx.x] = tmp;

  if (norm_selector > 0) //parallel reduction: sum
  {
    for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
    {
      __syncthreads();
      if (threadIdx.x < stride)
        tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x+stride];
    }
  }
  else //parallel reduction: max
  {
    for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
    {
      __syncthreads();
      if (threadIdx.x < stride)
        tmp_buffer[threadIdx.x] = (tmp_buffer[threadIdx.x] > tmp_buffer[threadIdx.x+stride]) ? tmp_buffer[threadIdx.x] : tmp_buffer[threadIdx.x+stride];
    }
  }

  if (threadIdx.x == 0)
    group_buffer[blockIdx.x] = tmp_buffer[0];
}

template <typename T>
__global__ void norm_kernel_unsigned_integers(const T * vec, unsigned int start1, unsigned int inc1, unsigned int size1,
                                              unsigned int norm_selector,
                                              T * group_buffer)
{
  __shared__ T tmp_buffer[128];

  T tmp = 0;
  unsigned int work_per_thread = (size1 - 1) / (gridDim.x * blockDim.x) + 1;
  unsigned int group_start = blockIdx.x * work_per_thread * blockDim.x;
  unsigned int group_stop  = (blockIdx.x + 1) * work_per_thread * blockDim.x;
  group_stop = (group_stop > size1) ? size1 : group_stop;

  if (norm_selector == 1) //norm_1 (no abs() needed for unsigned types)
  {
    for (unsigned int i = group_start + threadIdx.x; i < group_stop; i += blockDim.x)
      tmp += vec[i*inc1 + start1];
  }
  else if (norm_selector == 0) //norm_inf
  {
    for (unsigned int i = group_start + threadIdx.x; i < group_stop; i += blockDim.x)
      tmp = (tmp > vec[i*inc1 + start1]) ? tmp : vec[i*inc1 + start1];
  }

  tmp_buffer[threadIdx.x] = tmp;

  if (norm_selector > 0) //parallel reduction: sum
  {
    for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
    {
      __syncthreads();
      if (threadIdx.x < stride)
        tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x+stride];
    }
  }
  else //parallel reduction: max
  {
    for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
    {
      __syncthreads();
      if (threadIdx.x < stride)
        tmp_buffer[threadIdx.x] = (tmp_buffer[threadIdx.x] > tmp_buffer[threadIdx.x+stride]) ? tmp_buffer[threadIdx.x] : tmp_buffer[threadIdx.x+stride];
    }
  }

  if (threadIdx.x == 0)
    group_buffer[blockIdx.x] = tmp_buffer[0];
}
namespace detail
{
  struct norm_kernel_launcher_integers
  {
    template <typename T>
    static void apply(vector_base<T> const & vec1,
                      vector_base<T> & temp,
                      unsigned int option)
    {
      typedef T value_type;
      norm_kernel_integers<<<128, 128>>>(
          detail::cuda_arg<value_type>(vec1),
          static_cast<unsigned int>(viennacl::traits::start(vec1)), static_cast<unsigned int>(viennacl::traits::stride(vec1)), static_cast<unsigned int>(viennacl::traits::size(vec1)),
          static_cast<unsigned int>(option),
          detail::cuda_arg<value_type>(temp) );
      VIENNACL_CUDA_LAST_ERROR_CHECK("norm_kernel_integers");
    }
  };

  struct norm_kernel_launcher_unsigned_integers
  {
    template <typename T>
    static void apply(vector_base<T> const & vec1,
                      vector_base<T> & temp,
                      unsigned int option)
    {
      typedef T value_type;
      norm_kernel_unsigned_integers<<<128, 128>>>(
          detail::cuda_arg<value_type>(vec1),
          static_cast<unsigned int>(viennacl::traits::start(vec1)), static_cast<unsigned int>(viennacl::traits::stride(vec1)), static_cast<unsigned int>(viennacl::traits::size(vec1)),
          static_cast<unsigned int>(option),
          detail::cuda_arg<value_type>(temp) );
      VIENNACL_CUDA_LAST_ERROR_CHECK("norm_kernel_unsigned_integers");
    }
  };

  struct norm_kernel_launcher_floats
  {
    template <typename T>
    static void apply(vector_base<T> const & vec1,
                      vector_base<T> & temp,
                      unsigned int option)
    {
      typedef T value_type;
      norm_kernel_floats<<<128, 128>>>(
          detail::cuda_arg<value_type>(vec1),
          static_cast<unsigned int>(viennacl::traits::start(vec1)), static_cast<unsigned int>(viennacl::traits::stride(vec1)), static_cast<unsigned int>(viennacl::traits::size(vec1)),
          static_cast<unsigned int>(option),
          detail::cuda_arg<value_type>(temp) );
      VIENNACL_CUDA_LAST_ERROR_CHECK("norm_kernel_floats");
    }
  };

  template <typename T>
  struct norm_kernel_launcher : public norm_kernel_launcher_integers {};

  template <>
  struct norm_kernel_launcher<unsigned char>  : public norm_kernel_launcher_unsigned_integers {};

  template <>
  struct norm_kernel_launcher<unsigned short> : public norm_kernel_launcher_unsigned_integers {};

  template <>
  struct norm_kernel_launcher<unsigned int>   : public norm_kernel_launcher_unsigned_integers {};

  template <>
  struct norm_kernel_launcher<unsigned long>  : public norm_kernel_launcher_unsigned_integers {};

  template <>
  struct norm_kernel_launcher<float>  : public norm_kernel_launcher_floats {};

  template <>
  struct norm_kernel_launcher<double> : public norm_kernel_launcher_floats {};
} //namespace detail
/** @brief Computes the l^1-norm of a vector */
template <typename T>
void norm_1_impl(vector_base<T> const & vec1,
                 scalar<T> & result)
{
  typedef T value_type;

  vcl_size_t work_groups = 128;
  viennacl::vector<value_type> temp(work_groups);

  detail::norm_kernel_launcher<T>::apply(vec1, temp, 1);
  detail::vector_sum_kernel_launcher<T>::apply(temp, 1, result);
}

/** @brief Computes the l^1-norm of a vector with the final reduction on the CPU */
template <typename T>
void norm_1_cpu(vector_base<T> const & vec1,
                T & result)
{
  typedef T value_type;

  vcl_size_t work_groups = 128;
  viennacl::vector<value_type> temp(work_groups);

  detail::norm_kernel_launcher<T>::apply(vec1, temp, 1);

  std::vector<value_type> temp_cpu(work_groups);
  viennacl::fast_copy(temp.begin(), temp.end(), temp_cpu.begin());

  result = 0;
  for (typename std::vector<value_type>::const_iterator it = temp_cpu.begin(); it != temp_cpu.end(); ++it)
    result += *it;
}

/** @brief Computes the l^2-norm of a vector - implementation */
template <typename T>
void norm_2_impl(vector_base<T> const & vec1,
                 scalar<T> & result)
{
  typedef T value_type;

  vcl_size_t work_groups = 128;
  viennacl::vector<value_type> temp(work_groups);

  detail::norm_kernel_launcher<T>::apply(vec1, temp, 2);
  detail::vector_sum_kernel_launcher<T>::apply(temp, 2, result); // option 2: sqrt of the sum
}

/** @brief Computes the l^2-norm of a vector with the final reduction on the CPU */
template <typename T>
void norm_2_cpu(vector_base<T> const & vec1,
                T & result)
{
  typedef T value_type;

  vcl_size_t work_groups = 128;
  viennacl::vector<value_type> temp(work_groups);

  detail::norm_kernel_launcher<T>::apply(vec1, temp, 2);

  std::vector<value_type> temp_cpu(work_groups);
  viennacl::fast_copy(temp.begin(), temp.end(), temp_cpu.begin());

  result = 0;
  for (typename std::vector<value_type>::const_iterator it = temp_cpu.begin(); it != temp_cpu.end(); ++it)
    result += *it;
  result = std::sqrt(result);
}

/** @brief Computes the supremum-norm of a vector */
template <typename T>
void norm_inf_impl(vector_base<T> const & vec1,
                   scalar<T> & result)
{
  typedef T value_type;

  vcl_size_t work_groups = 128;
  viennacl::vector<value_type> temp(work_groups);

  detail::norm_kernel_launcher<T>::apply(vec1, temp, 0);
  detail::vector_sum_kernel_launcher<T>::apply(temp, 0, result);
}

/** @brief Computes the supremum-norm of a vector with the final reduction on the CPU */
template <typename T>
void norm_inf_cpu(vector_base<T> const & vec1,
                  T & result)
{
  typedef T value_type;

  vcl_size_t work_groups = 128;
  viennacl::vector<value_type> temp(work_groups);

  detail::norm_kernel_launcher<T>::apply(vec1, temp, 0);

  std::vector<value_type> temp_cpu(work_groups);
  viennacl::fast_copy(temp.begin(), temp.end(), temp_cpu.begin());

  result = 0;
  for (typename std::vector<value_type>::const_iterator it = temp_cpu.begin(); it != temp_cpu.end(); ++it)
    result = std::max(result, *it);
}
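// Usage sketch (illustrative): the high-level norm functions dispatch here:
//
//   viennacl::scalar<float> n2 = viennacl::linalg::norm_2(x);   // -> norm_2_impl()
//   float n1   = viennacl::linalg::norm_1(x);                   // -> norm_1_cpu()
//   float ninf = viennacl::linalg::norm_inf(x);                 // -> norm_inf_cpu()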
//////////////////////////////////////
// Handle various abs() overloads in a uniform way for the device code below:
template <typename T>
__device__ T cuda_abs(T val) { return (val < 0) ? -val : val; }
__device__ inline unsigned long  cuda_abs(unsigned long  val) { return val; }
__device__ inline unsigned int   cuda_abs(unsigned int   val) { return val; }
__device__ inline unsigned short cuda_abs(unsigned short val) { return val; }
__device__ inline unsigned char  cuda_abs(unsigned char  val) { return val; }
template <typename T>
__global__ void index_norm_inf_kernel(const T * vec, unsigned int start1, unsigned int inc1, unsigned int size1,
                                      unsigned int * result)
{
  __shared__ T float_buffer[128];
  __shared__ unsigned int index_buffer[128];

  float_buffer[threadIdx.x] = 0;
  index_buffer[threadIdx.x] = 0;

  //step 1: fill buffer:
  T cur_max = 0;
  T tmp;
  for (unsigned int i = threadIdx.x; i < size1; i += blockDim.x)
  {
    tmp = vec[i*inc1+start1];
    tmp = cuda_abs(tmp);
    if (cur_max < tmp) //find maximum
    {
      float_buffer[threadIdx.x] = tmp;
      index_buffer[threadIdx.x] = i;
      cur_max = tmp;
    }
  }

  //step 2: parallel reduction:
  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
  {
    __syncthreads();
    if (threadIdx.x < stride)
    {
      //keep the first occurrence of the maximum
      if (float_buffer[threadIdx.x] < float_buffer[threadIdx.x+stride])
      {
        index_buffer[threadIdx.x] = index_buffer[threadIdx.x+stride];
        float_buffer[threadIdx.x] = float_buffer[threadIdx.x+stride];
      }
    }
  }

  if (threadIdx.x == 0)
    *result = index_buffer[0];
}
/** @brief Computes the index of the first entry that is equal to the supremum-norm in modulus */
template <typename T>
vcl_size_t index_norm_inf(vector_base<T> const & vec1)
{
  typedef T value_type;

  viennacl::backend::mem_handle h;
  viennacl::backend::memory_create(h, sizeof(unsigned int), viennacl::traits::context(vec1));

  index_norm_inf_kernel<<<1, 128>>>(
      detail::cuda_arg<value_type>(vec1),
      static_cast<unsigned int>(viennacl::traits::start(vec1)), static_cast<unsigned int>(viennacl::traits::stride(vec1)), static_cast<unsigned int>(viennacl::traits::size(vec1)),
      reinterpret_cast<unsigned int *>(h.cuda_handle().get()) );
  VIENNACL_CUDA_LAST_ERROR_CHECK("index_norm_inf_kernel");

  unsigned int ret = 0;
  viennacl::backend::memory_read(h, 0, sizeof(unsigned int), &ret);
  return static_cast<vcl_size_t>(ret);
}
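// Usage sketch (illustrative): exposed as viennacl::linalg::index_norm_inf():
//
//   vcl_size_t idx = viennacl::linalg::index_norm_inf(x);
//   // index of the first entry with |x_i| == ||x||_inf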
////////////////////////////////////////////

template <typename T>
__global__ void plane_rotation_kernel(T * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                                      T * vec2, unsigned int start2, unsigned int inc2, unsigned int size2,
                                      T alpha, T beta)
{
  T tmp1 = 0;
  T tmp2 = 0;

  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += blockDim.x * gridDim.x)
  {
    tmp1 = vec1[i*inc1+start1];
    tmp2 = vec2[i*inc2+start2];

    vec1[i*inc1+start1] = alpha * tmp1 + beta * tmp2;
    vec2[i*inc2+start2] = alpha * tmp2 - beta * tmp1;
  }
}
/** @brief Computes a plane rotation of two vectors */
template <typename T>
void plane_rotation(vector_base<T> & vec1,
                    vector_base<T> & vec2,
                    T alpha, T beta)
{
  typedef T value_type;

  value_type temporary_alpha = 0;
  if (viennacl::is_cpu_scalar<T>::value)
    temporary_alpha = alpha;

  value_type temporary_beta = 0;
  if (viennacl::is_cpu_scalar<T>::value)
    temporary_beta = beta;

  plane_rotation_kernel<<<128, 128>>>(
      detail::cuda_arg<value_type>(vec1),
      static_cast<unsigned int>(viennacl::traits::start(vec1)), static_cast<unsigned int>(viennacl::traits::stride(vec1)), static_cast<unsigned int>(viennacl::traits::size(vec1)),
      detail::cuda_arg<value_type>(vec2),
      static_cast<unsigned int>(viennacl::traits::start(vec2)), static_cast<unsigned int>(viennacl::traits::stride(vec2)), static_cast<unsigned int>(viennacl::traits::size(vec2)),
      detail::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
      detail::cuda_arg<value_type>(detail::arg_reference(beta, temporary_beta)) );
  VIENNACL_CUDA_LAST_ERROR_CHECK("plane_rotation_kernel");
}
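// Usage sketch (illustrative): plane_rotation() applies the Givens rotation
//   (x_i, y_i) <- (alpha*x_i + beta*y_i, alpha*y_i - beta*x_i)
// to each pair of entries, as used e.g. in QR algorithms:
//
//   viennacl::linalg::plane_rotation(v1, v2, alpha, beta);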
} //namespace cuda
} //namespace linalg
} //namespace viennacl

#endif