ViennaCL - The Vienna Computing Library  1.5.2
matrix_operations_col.hpp
1 #ifndef VIENNACL_LINALG_CUDA_MATRIX_OPERATIONS_COL_HPP_
2 #define VIENNACL_LINALG_CUDA_MATRIX_OPERATIONS_COL_HPP_
3 
4 /* =========================================================================
5  Copyright (c) 2010-2014, Institute for Microelectronics,
6  Institute for Analysis and Scientific Computing,
7  TU Wien.
8  Portions of this software are copyright by UChicago Argonne, LLC.
9 
10  -----------------
11  ViennaCL - The Vienna Computing Library
12  -----------------
13 
14  Project Head: Karl Rupp rupp@iue.tuwien.ac.at
15 
16  (A list of authors and contributors can be found in the PDF manual)
17 
18  License: MIT (X11), see file LICENSE in the base directory
19 ============================================================================= */
20 
26 namespace viennacl
27 {
28  namespace linalg
29  {
30  namespace cuda
31  {
32  //
33  // am
34  //
35 
36  // alpha on CPU
37  template <typename T>
38  __global__ void am_col_kernel(
39  T * A,
40  unsigned int A_start1, unsigned int A_start2,
41  unsigned int A_inc1, unsigned int A_inc2,
42  unsigned int A_size1, unsigned int A_size2,
43  unsigned int A_internal_size1, unsigned int A_internal_size2,
44 
45  T fac2,
46  unsigned int options2,
47  const T * B,
48  unsigned int B_start1, unsigned int B_start2,
49  unsigned int B_inc1, unsigned int B_inc2,
50  unsigned int B_internal_size1, unsigned int B_internal_size2)
51  {
52  T alpha = fac2;
53  if (options2 & (1 << 0))
54  alpha = -alpha;
55 
 56  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
 57  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
58 
59  if (options2 & (1 << 1))
60  {
61  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
62  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
63  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha;
64  }
65  else
66  {
67  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
68  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
69  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha;
70  }
71  }
72 
73  // alpha on GPU
74  template <typename T>
75  __global__ void am_col_kernel(
76  T * A,
77  unsigned int A_start1, unsigned int A_start2,
78  unsigned int A_inc1, unsigned int A_inc2,
79  unsigned int A_size1, unsigned int A_size2,
80  unsigned int A_internal_size1, unsigned int A_internal_size2,
81 
82  const T * fac2,
83  unsigned int options2,
84  const T * B,
85  unsigned int B_start1, unsigned int B_start2,
86  unsigned int B_inc1, unsigned int B_inc2,
87  unsigned int B_internal_size1, unsigned int B_internal_size2)
88  {
89  T alpha = *fac2;
90  if (options2 & (1 << 0))
91  alpha = -alpha;
92 
93  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
94  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
95 
96  if (options2 & (1 << 1))
97  {
98  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
99  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
100  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha;
101  }
102  else
103  {
104  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
105  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
106  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha;
107  }
108  }
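
The two am_col_kernel overloads above implement A = alpha * B (with alpha passed by value from the host, or read from GPU memory), where options2 bit 0 negates the scalar and bit 1 switches from multiplication to division. A minimal host-side launch sketch follows; the <<<128, 128>>> configuration, the helper name am_example, and the include are illustrative assumptions, not part of this header.

// Illustrative only (compile as a .cu file with nvcc): unpadded, column-major
// A and B of size rows x cols.
#include "viennacl/linalg/cuda/matrix_operations_col.hpp"

void am_example(float * d_A, const float * d_B,
                unsigned int rows, unsigned int cols, float alpha)
{
  unsigned int options2 = 0;  // bit 0: use -alpha, bit 1: divide by alpha instead of multiplying
  viennacl::linalg::cuda::am_col_kernel<<<128, 128>>>(
      d_A,
      0, 0,          // A_start1, A_start2
      1, 1,          // A_inc1, A_inc2
      rows, cols,    // A_size1, A_size2
      rows, cols,    // A_internal_size1, A_internal_size2 (no padding assumed)
      alpha, options2,
      d_B,
      0, 0, 1, 1,    // B_start1, B_start2, B_inc1, B_inc2
      rows, cols);   // B_internal_size1, B_internal_size2
}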
109 
110 
111  //
112  // ambm
113  //
114 
115  // alpha and beta on CPU
116  template <typename T>
117  __global__ void ambm_col_kernel(
118  T * A,
119  unsigned int A_start1, unsigned int A_start2,
120  unsigned int A_inc1, unsigned int A_inc2,
121  unsigned int A_size1, unsigned int A_size2,
122  unsigned int A_internal_size1, unsigned int A_internal_size2,
123 
124  T fac2,
125  unsigned int options2,
126  const T * B,
127  unsigned int B_start1, unsigned int B_start2,
128  unsigned int B_inc1, unsigned int B_inc2,
129  unsigned int B_internal_size1, unsigned int B_internal_size2,
130 
131  T fac3,
132  unsigned int options3,
133  const T * C,
134  unsigned int C_start1, unsigned int C_start2,
135  unsigned int C_inc1, unsigned int C_inc2,
136  unsigned int C_internal_size1, unsigned int C_internal_size2)
137  {
138  T alpha = fac2;
139  if (options2 & (1 << 0))
140  alpha = -alpha;
141 
142  T beta = fac3;
143  if (options3 & (1 << 0))
144  beta = -beta;
145 
146  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
147  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
148 
149  if (options2 & (1 << 1))
150  {
151  if (options3 & (1 << 1))
152  {
153  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
154  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
155  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
156  = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
157  + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
158  }
159  else
160  {
161  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
162  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
163  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
164  = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
165  + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
166  }
167  }
168  else
169  {
170  if (options3 & (1 << 1))
171  {
172  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
173  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
174  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
175  = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
176  + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
177  }
178  else
179  {
180  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
181  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
182  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
183  = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
184  + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
185  }
186  }
187  }
188 
189 
190  // alpha on CPU, beta on GPU
191  template <typename T>
192  __global__ void ambm_col_kernel(
193  T * A,
194  unsigned int A_start1, unsigned int A_start2,
195  unsigned int A_inc1, unsigned int A_inc2,
196  unsigned int A_size1, unsigned int A_size2,
197  unsigned int A_internal_size1, unsigned int A_internal_size2,
198 
199  T fac2,
200  unsigned int options2,
201  const T * B,
202  unsigned int B_start1, unsigned int B_start2,
203  unsigned int B_inc1, unsigned int B_inc2,
204  unsigned int B_internal_size1, unsigned int B_internal_size2,
205 
206  const T * fac3,
207  unsigned int options3,
208  const T * C,
209  unsigned int C_start1, unsigned int C_start2,
210  unsigned int C_inc1, unsigned int C_inc2,
211  unsigned int C_internal_size1, unsigned int C_internal_size2)
212  {
213  T alpha = fac2;
214  if (options2 & (1 << 0))
215  alpha = -alpha;
216 
217  T beta = *fac3;
218  if (options3 & (1 << 0))
219  beta = -beta;
220 
221  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
222  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
223 
224  if (options2 & (1 << 1))
225  {
226  if (options3 & (1 << 1))
227  {
228  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
229  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
230  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
231  = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
232  + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
233  }
234  else
235  {
236  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
237  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
238  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
239  = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
240  + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
241  }
242  }
243  else
244  {
245  if (options3 & (1 << 1))
246  {
247  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
248  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
249  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
250  = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
251  + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
252  }
253  else
254  {
255  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
256  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
257  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
258  = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
259  + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
260  }
261  }
262  }
263 
264  // alpha on GPU, beta on CPU
265  template <typename T>
266  __global__ void ambm_col_kernel(
267  T * A,
268  unsigned int A_start1, unsigned int A_start2,
269  unsigned int A_inc1, unsigned int A_inc2,
270  unsigned int A_size1, unsigned int A_size2,
271  unsigned int A_internal_size1, unsigned int A_internal_size2,
272 
273  const T * fac2,
274  unsigned int options2,
275  const T * B,
276  unsigned int B_start1, unsigned int B_start2,
277  unsigned int B_inc1, unsigned int B_inc2,
278  unsigned int B_internal_size1, unsigned int B_internal_size2,
279 
280  T fac3,
281  unsigned int options3,
282  const T * C,
283  unsigned int C_start1, unsigned int C_start2,
284  unsigned int C_inc1, unsigned int C_inc2,
285  unsigned int C_internal_size1, unsigned int C_internal_size2)
286  {
287  T alpha = *fac2;
288  if (options2 & (1 << 0))
289  alpha = -alpha;
290 
291  T beta = fac3;
292  if (options3 & (1 << 0))
293  beta = -beta;
294 
295  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
296  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
297 
298  if (options2 & (1 << 1))
299  {
300  if (options3 & (1 << 1))
301  {
302  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
303  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
304  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
305  = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
306  + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
307  }
308  else
309  {
310  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
311  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
312  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
313  = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
314  + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
315  }
316  }
317  else
318  {
319  if (options3 & (1 << 1))
320  {
321  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
322  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
323  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
324  = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
325  + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
326  }
327  else
328  {
329  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
330  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
331  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
332  = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
333  + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
334  }
335  }
336  }
337 
338 
339  // alpha and beta on GPU
340  template <typename T>
341  __global__ void ambm_col_kernel(
342  T * A,
343  unsigned int A_start1, unsigned int A_start2,
344  unsigned int A_inc1, unsigned int A_inc2,
345  unsigned int A_size1, unsigned int A_size2,
346  unsigned int A_internal_size1, unsigned int A_internal_size2,
347 
348  const T * fac2,
349  unsigned int options2,
350  const T * B,
351  unsigned int B_start1, unsigned int B_start2,
352  unsigned int B_inc1, unsigned int B_inc2,
353  unsigned int B_internal_size1, unsigned int B_internal_size2,
354 
355  const T * fac3,
356  unsigned int options3,
357  const T * C,
358  unsigned int C_start1, unsigned int C_start2,
359  unsigned int C_inc1, unsigned int C_inc2,
360  unsigned int C_internal_size1, unsigned int C_internal_size2)
361  {
362  T alpha = *fac2;
363  if (options2 & (1 << 0))
364  alpha = -alpha;
365 
366  T beta = *fac3;
367  if (options3 & (1 << 0))
368  beta = -beta;
369 
370  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
371  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
372 
373  if (options2 & (1 << 1))
374  {
375  if (options3 & (1 << 1))
376  {
377  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
378  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
379  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
380  = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
381  + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
382  }
383  else
384  {
385  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
386  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
387  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
388  = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
389  + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
390  }
391  }
392  else
393  {
394  if (options3 & (1 << 1))
395  {
396  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
397  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
398  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
399  = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
400  + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
401  }
402  else
403  {
404  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
405  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
406  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
407  = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
408  + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
409  }
410  }
411  }
412 
413 
414  //
415  // ambm_m
416  //
417 
418  // alpha and beta on CPU
419  template <typename T>
420  __global__ void ambm_m_col_kernel(
421  T * A,
422  unsigned int A_start1, unsigned int A_start2,
423  unsigned int A_inc1, unsigned int A_inc2,
424  unsigned int A_size1, unsigned int A_size2,
425  unsigned int A_internal_size1, unsigned int A_internal_size2,
426 
427  T fac2,
428  unsigned int options2,
429  const T * B,
430  unsigned int B_start1, unsigned int B_start2,
431  unsigned int B_inc1, unsigned int B_inc2,
432  unsigned int B_internal_size1, unsigned int B_internal_size2,
433 
434  T fac3,
435  unsigned int options3,
436  const T * C,
437  unsigned int C_start1, unsigned int C_start2,
438  unsigned int C_inc1, unsigned int C_inc2,
439  unsigned int C_internal_size1, unsigned int C_internal_size2)
440  {
441  T alpha = fac2;
442  if (options2 & (1 << 0))
443  alpha = -alpha;
444 
445  T beta = fac3;
446  if (options3 & (1 << 0))
447  beta = -beta;
448 
449  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
450  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
451 
452  if (options2 & (1 << 1))
453  {
454  if (options3 & (1 << 1))
455  {
456  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
457  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
458  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
459  += B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
460  + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
461  }
462  else
463  {
464  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
465  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
466  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
467  += B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
468  + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
469  }
470  }
471  else
472  {
473  if (options3 & (1 << 1))
474  {
475  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
476  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
477  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
478  += B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
479  + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
480  }
481  else
482  {
483  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
484  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
485  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
486  += B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
487  + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
488  }
489  }
490  }
491 
492 
493  // alpha on CPU, beta on GPU
494  template <typename T>
495  __global__ void ambm_m_col_kernel(
496  T * A,
497  unsigned int A_start1, unsigned int A_start2,
498  unsigned int A_inc1, unsigned int A_inc2,
499  unsigned int A_size1, unsigned int A_size2,
500  unsigned int A_internal_size1, unsigned int A_internal_size2,
501 
502  T fac2,
503  unsigned int options2,
504  const T * B,
505  unsigned int B_start1, unsigned int B_start2,
506  unsigned int B_inc1, unsigned int B_inc2,
507  unsigned int B_internal_size1, unsigned int B_internal_size2,
508 
509  const T * fac3,
510  unsigned int options3,
511  const T * C,
512  unsigned int C_start1, unsigned int C_start2,
513  unsigned int C_inc1, unsigned int C_inc2,
514  unsigned int C_internal_size1, unsigned int C_internal_size2)
515  {
516  T alpha = fac2;
517  if (options2 & (1 << 0))
518  alpha = -alpha;
519 
520  T beta = *fac3;
521  if (options3 & (1 << 0))
522  beta = -beta;
523 
524  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
525  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
526 
527  if (options2 & (1 << 1))
528  {
529  if (options3 & (1 << 1))
530  {
531  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
532  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
533  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
 534  += B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
535  + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
536  }
537  else
538  {
539  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
540  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
541  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
 542  += B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
543  + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
544  }
545  }
546  else
547  {
548  if (options3 & (1 << 1))
549  {
550  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
551  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
552  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
 553  += B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
554  + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
555  }
556  else
557  {
558  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
559  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
560  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
 561  += B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
562  + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
563  }
564  }
565  }
566 
567  // alpha on GPU, beta on CPU
568  template <typename T>
569  __global__ void ambm_m_col_kernel(
570  T * A,
571  unsigned int A_start1, unsigned int A_start2,
572  unsigned int A_inc1, unsigned int A_inc2,
573  unsigned int A_size1, unsigned int A_size2,
574  unsigned int A_internal_size1, unsigned int A_internal_size2,
575 
576  const T * fac2,
577  unsigned int options2,
578  const T * B,
579  unsigned int B_start1, unsigned int B_start2,
580  unsigned int B_inc1, unsigned int B_inc2,
581  unsigned int B_internal_size1, unsigned int B_internal_size2,
582 
583  T fac3,
584  unsigned int options3,
585  const T * C,
586  unsigned int C_start1, unsigned int C_start2,
587  unsigned int C_inc1, unsigned int C_inc2,
588  unsigned int C_internal_size1, unsigned int C_internal_size2)
589  {
590  T alpha = *fac2;
591  if (options2 & (1 << 0))
592  alpha = -alpha;
593 
594  T beta = fac3;
595  if (options3 & (1 << 0))
596  beta = -beta;
597 
598  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
599  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
600 
601  if (options2 & (1 << 1))
602  {
603  if (options3 & (1 << 1))
604  {
605  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
606  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
607  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
 608  += B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
609  + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
610  }
611  else
612  {
613  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
614  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
615  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
 616  += B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
617  + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
618  }
619  }
620  else
621  {
622  if (options3 & (1 << 1))
623  {
624  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
625  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
626  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
 627  += B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
628  + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
629  }
630  else
631  {
632  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
633  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
634  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
 635  += B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
636  + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
637  }
638  }
639  }
640 
641 
642  // alpha and beta on GPU
643  template <typename T>
644  __global__ void ambm_m_col_kernel(
645  T * A,
646  unsigned int A_start1, unsigned int A_start2,
647  unsigned int A_inc1, unsigned int A_inc2,
648  unsigned int A_size1, unsigned int A_size2,
649  unsigned int A_internal_size1, unsigned int A_internal_size2,
650 
651  const T * fac2,
652  unsigned int options2,
653  const T * B,
654  unsigned int B_start1, unsigned int B_start2,
655  unsigned int B_inc1, unsigned int B_inc2,
656  unsigned int B_internal_size1, unsigned int B_internal_size2,
657 
658  const T * fac3,
659  unsigned int options3,
660  const T * C,
661  unsigned int C_start1, unsigned int C_start2,
662  unsigned int C_inc1, unsigned int C_inc2,
663  unsigned int C_internal_size1, unsigned int C_internal_size2)
664  {
665  T alpha = *fac2;
666  if (options2 & (1 << 0))
667  alpha = -alpha;
668 
669  T beta = *fac3;
670  if (options3 & (1 << 0))
671  beta = -beta;
672 
673  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
674  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
675 
676  if (options2 & (1 << 1))
677  {
678  if (options3 & (1 << 1))
679  {
680  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
681  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
682  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
 683  += B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
684  + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
685  }
686  else
687  {
688  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
689  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
690  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
 691  += B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
692  + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
693  }
694  }
695  else
696  {
697  if (options3 & (1 << 1))
698  {
699  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
700  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
701  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
 702  += B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
703  + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
704  }
705  else
706  {
707  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
708  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
709  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
 710  += B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
711  + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
712  }
713  }
714  }
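
The four ambm_col_kernel and four ambm_m_col_kernel overloads differ only in whether alpha and beta are passed by value from the host or read through a device pointer; options2/options3 carry the same sign and reciprocal flags as above, and the _m variants accumulate with += instead of assigning. A hedged usage sketch for the mixed case (alpha on the GPU, beta on the host) follows; launch configuration and helper name are illustrative assumptions.

// Illustrative only: A = alpha * B + beta * C, alpha resident on the device.
#include "viennacl/linalg/cuda/matrix_operations_col.hpp"

void ambm_example(float * d_A, const float * d_alpha, const float * d_B,
                  float beta, const float * d_C,
                  unsigned int rows, unsigned int cols)
{
  viennacl::linalg::cuda::ambm_col_kernel<<<128, 128>>>(
      d_A, 0, 0, 1, 1, rows, cols, rows, cols,
      d_alpha, 0u,                  // alpha read from GPU memory, options2 = 0
      d_B, 0, 0, 1, 1, rows, cols,
      beta, 0u,                     // beta passed from the host, options3 = 0
      d_C, 0, 0, 1, 1, rows, cols);
}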
715 
716 
717 
718  //
719  // assignments
720  //
721 
722  template <typename T>
723  __global__ void matrix_col_assign_kernel(
724  T * A,
725  unsigned int A_start1, unsigned int A_start2,
726  unsigned int A_inc1, unsigned int A_inc2,
727  unsigned int A_size1, unsigned int A_size2,
728  unsigned int A_internal_size1, unsigned int A_internal_size2,
729  T alpha)
730  {
731  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
732  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
733 
734  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
735  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
736  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = alpha;
737  }
738 
739 
740  template <typename T>
 741  __global__ void matrix_col_diagonal_assign_kernel(
 742  T * A,
743  unsigned int A_start1, unsigned int A_start2,
744  unsigned int A_inc1, unsigned int A_inc2,
745  unsigned int A_size1, unsigned int A_size2,
746  unsigned int A_internal_size1, unsigned int A_internal_size2,
747  T alpha)
748  {
749  unsigned int gid = (blockIdx.x * blockDim.x + threadIdx.x);
750 
751  for (unsigned int row = gid; row < A_size1; row += blockDim.x * gridDim.x)
752  A[(row * A_inc1 + A_start1) + (row * A_inc2 + A_start2) * A_internal_size1] = alpha;
753  }
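
matrix_col_assign_kernel writes a constant into every entry of the logical rows x cols block, leaving any padding between size and internal_size untouched, while matrix_col_diagonal_assign_kernel writes only the diagonal. A hedged sketch of the former follows; the launch configuration and helper name are assumptions.

// Illustrative only: fill the logical part of a padded column-major matrix.
#include "viennacl/linalg/cuda/matrix_operations_col.hpp"

void fill_padded_matrix(float * d_A,
                        unsigned int rows, unsigned int cols,
                        unsigned int internal_rows, unsigned int internal_cols,
                        float value)
{
  viennacl::linalg::cuda::matrix_col_assign_kernel<<<128, 128>>>(
      d_A,
      0, 0,                          // start offsets
      1, 1,                          // increments
      rows, cols,                    // logical sizes
      internal_rows, internal_cols,  // padded (allocated) sizes
      value);
}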
754 
755  //
756  // binary element-wise operations
757  //
758 
759  template <typename T>
760  __global__ void element_op_col_kernel(
761  T * A,
762  unsigned int A_start1, unsigned int A_start2,
763  unsigned int A_inc1, unsigned int A_inc2,
764  unsigned int A_size1, unsigned int A_size2,
765  unsigned int A_internal_size1, unsigned int A_internal_size2,
766 
767  const T * B,
768  unsigned int B_start1, unsigned int B_start2,
769  unsigned int B_inc1, unsigned int B_inc2,
770  unsigned int B_internal_size1, unsigned int B_internal_size2,
771 
772  const T * C,
773  unsigned int C_start1, unsigned int C_start2,
774  unsigned int C_inc1, unsigned int C_inc2,
775  unsigned int C_internal_size1, unsigned int C_internal_size2,
776 
777  unsigned int op_type) //0: product, 1: division, 2: pow
778  {
779  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
780  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
781 
782  if (op_type == 2)
783  {
784  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
785  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
786  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
787  = pow(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1],
788  C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1]);
789  }
790  else if (op_type == 1)
791  {
792  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
793  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
794  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
795  = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]
796  / C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1];
797  }
798  else if (op_type == 0)
799  {
800  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
801  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
802  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
803  = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]
804  * C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1];
805  }
806  }
807 
808  template <typename T>
809  __global__ void element_op_int_col_kernel(
810  T * A,
811  unsigned int A_start1, unsigned int A_start2,
812  unsigned int A_inc1, unsigned int A_inc2,
813  unsigned int A_size1, unsigned int A_size2,
814  unsigned int A_internal_size1, unsigned int A_internal_size2,
815 
816  const T * B,
817  unsigned int B_start1, unsigned int B_start2,
818  unsigned int B_inc1, unsigned int B_inc2,
819  unsigned int B_internal_size1, unsigned int B_internal_size2,
820 
821  const T * C,
822  unsigned int C_start1, unsigned int C_start2,
823  unsigned int C_inc1, unsigned int C_inc2,
824  unsigned int C_internal_size1, unsigned int C_internal_size2,
825 
826  unsigned int op_type) //0: product, 1: division, 2: pow
827  {
828  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
829  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
830 
831  if (op_type == 1)
832  {
833  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
834  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
835  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
836  = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]
837  / C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1];
838  }
839  else if (op_type == 0)
840  {
841  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
842  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
843  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
844  = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]
845  * C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1];
846  }
847  }
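
Both binary kernels take the operation as a runtime op_type code: 0 selects the element-wise product, 1 element-wise division, and 2 (floating-point variant only) pow. A hedged sketch for element-wise division follows; launch configuration and helper name are assumptions.

// Illustrative only: A = B / C element-wise on unpadded column-major matrices.
#include "viennacl/linalg/cuda/matrix_operations_col.hpp"

void elementwise_divide(float * d_A, const float * d_B, const float * d_C,
                        unsigned int rows, unsigned int cols)
{
  unsigned int op_type = 1;  // 0: product, 1: division, 2: pow
  viennacl::linalg::cuda::element_op_col_kernel<<<128, 128>>>(
      d_A, 0, 0, 1, 1, rows, cols, rows, cols,
      d_B, 0, 0, 1, 1, rows, cols,
      d_C, 0, 0, 1, 1, rows, cols,
      op_type);
}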
848 
849 
850  //
851  // unary element-wise operations
852  //
853 
854  // abs
855  template <typename T>
 856  __global__ void matrix_col_element_abs_kernel(
 857  T * A,
858  unsigned int A_start1, unsigned int A_start2,
859  unsigned int A_inc1, unsigned int A_inc2,
860  unsigned int A_size1, unsigned int A_size2,
861  unsigned int A_internal_size1, unsigned int A_internal_size2,
862 
863  const T * B,
864  unsigned int B_start1, unsigned int B_start2,
865  unsigned int B_inc1, unsigned int B_inc2,
866  unsigned int B_internal_size1, unsigned int B_internal_size2)
867  {
868  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
869  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
870 
871  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
872  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
873  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = abs(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
874  }
875 
876 
877  // acos
878  template <typename T>
 879  __global__ void matrix_col_element_acos_kernel(
 880  T * A,
881  unsigned int A_start1, unsigned int A_start2,
882  unsigned int A_inc1, unsigned int A_inc2,
883  unsigned int A_size1, unsigned int A_size2,
884  unsigned int A_internal_size1, unsigned int A_internal_size2,
885 
886  const T * B,
887  unsigned int B_start1, unsigned int B_start2,
888  unsigned int B_inc1, unsigned int B_inc2,
889  unsigned int B_internal_size1, unsigned int B_internal_size2)
890  {
891  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
892  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
893 
894  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
895  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
896  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = acos(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
897  }
898 
899 
900  // asin
901  template <typename T>
 902  __global__ void matrix_col_element_asin_kernel(
 903  T * A,
904  unsigned int A_start1, unsigned int A_start2,
905  unsigned int A_inc1, unsigned int A_inc2,
906  unsigned int A_size1, unsigned int A_size2,
907  unsigned int A_internal_size1, unsigned int A_internal_size2,
908 
909  const T * B,
910  unsigned int B_start1, unsigned int B_start2,
911  unsigned int B_inc1, unsigned int B_inc2,
912  unsigned int B_internal_size1, unsigned int B_internal_size2)
913  {
914  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
915  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
916 
917  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
918  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
919  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = asin(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
920  }
921 
922 
923  // atan
924  template <typename T>
 925  __global__ void matrix_col_element_atan_kernel(
 926  T * A,
927  unsigned int A_start1, unsigned int A_start2,
928  unsigned int A_inc1, unsigned int A_inc2,
929  unsigned int A_size1, unsigned int A_size2,
930  unsigned int A_internal_size1, unsigned int A_internal_size2,
931 
932  const T * B,
933  unsigned int B_start1, unsigned int B_start2,
934  unsigned int B_inc1, unsigned int B_inc2,
935  unsigned int B_internal_size1, unsigned int B_internal_size2)
936  {
937  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
938  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
939 
940  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
941  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
942  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = atan(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
943  }
944 
945 
946  // ceil
947  template <typename T>
 948  __global__ void matrix_col_element_ceil_kernel(
 949  T * A,
950  unsigned int A_start1, unsigned int A_start2,
951  unsigned int A_inc1, unsigned int A_inc2,
952  unsigned int A_size1, unsigned int A_size2,
953  unsigned int A_internal_size1, unsigned int A_internal_size2,
954 
955  const T * B,
956  unsigned int B_start1, unsigned int B_start2,
957  unsigned int B_inc1, unsigned int B_inc2,
958  unsigned int B_internal_size1, unsigned int B_internal_size2)
959  {
960  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
961  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
962 
963  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
964  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
965  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = ceil(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
966  }
967 
968 
969  // cos
970  template <typename T>
 971  __global__ void matrix_col_element_cos_kernel(
 972  T * A,
973  unsigned int A_start1, unsigned int A_start2,
974  unsigned int A_inc1, unsigned int A_inc2,
975  unsigned int A_size1, unsigned int A_size2,
976  unsigned int A_internal_size1, unsigned int A_internal_size2,
977 
978  const T * B,
979  unsigned int B_start1, unsigned int B_start2,
980  unsigned int B_inc1, unsigned int B_inc2,
981  unsigned int B_internal_size1, unsigned int B_internal_size2)
982  {
983  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
984  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
985 
986  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
987  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
988  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = cos(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
989  }
990 
991 
992  // cosh
993  template <typename T>
 994  __global__ void matrix_col_element_cosh_kernel(
 995  T * A,
996  unsigned int A_start1, unsigned int A_start2,
997  unsigned int A_inc1, unsigned int A_inc2,
998  unsigned int A_size1, unsigned int A_size2,
999  unsigned int A_internal_size1, unsigned int A_internal_size2,
1000 
1001  const T * B,
1002  unsigned int B_start1, unsigned int B_start2,
1003  unsigned int B_inc1, unsigned int B_inc2,
1004  unsigned int B_internal_size1, unsigned int B_internal_size2)
1005  {
1006  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1007  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
1008 
1009  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
1010  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
1011  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = cosh(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
1012  }
1013 
1014 
1015  // exp
1016  template <typename T>
 1017  __global__ void matrix_col_element_exp_kernel(
 1018  T * A,
1019  unsigned int A_start1, unsigned int A_start2,
1020  unsigned int A_inc1, unsigned int A_inc2,
1021  unsigned int A_size1, unsigned int A_size2,
1022  unsigned int A_internal_size1, unsigned int A_internal_size2,
1023 
1024  const T * B,
1025  unsigned int B_start1, unsigned int B_start2,
1026  unsigned int B_inc1, unsigned int B_inc2,
1027  unsigned int B_internal_size1, unsigned int B_internal_size2)
1028  {
1029  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1030  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
1031 
1032  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
1033  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
1034  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = exp(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
1035  }
1036 
1037 
1038  // fabs
1039  template <typename T>
 1040  __global__ void matrix_col_element_fabs_kernel(
 1041  T * A,
1042  unsigned int A_start1, unsigned int A_start2,
1043  unsigned int A_inc1, unsigned int A_inc2,
1044  unsigned int A_size1, unsigned int A_size2,
1045  unsigned int A_internal_size1, unsigned int A_internal_size2,
1046 
1047  const T * B,
1048  unsigned int B_start1, unsigned int B_start2,
1049  unsigned int B_inc1, unsigned int B_inc2,
1050  unsigned int B_internal_size1, unsigned int B_internal_size2)
1051  {
1052  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1053  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
1054 
1055  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
1056  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
1057  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = fabs(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
1058  }
1059 
1060 
1061  // floor
1062  template <typename T>
 1063  __global__ void matrix_col_element_floor_kernel(
 1064  T * A,
1065  unsigned int A_start1, unsigned int A_start2,
1066  unsigned int A_inc1, unsigned int A_inc2,
1067  unsigned int A_size1, unsigned int A_size2,
1068  unsigned int A_internal_size1, unsigned int A_internal_size2,
1069 
1070  const T * B,
1071  unsigned int B_start1, unsigned int B_start2,
1072  unsigned int B_inc1, unsigned int B_inc2,
1073  unsigned int B_internal_size1, unsigned int B_internal_size2)
1074  {
1075  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1076  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
1077 
1078  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
1079  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
1080  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = floor(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
1081  }
1082 
1083 
1084  // log
1085  template <typename T>
 1086  __global__ void matrix_col_element_log_kernel(
 1087  T * A,
1088  unsigned int A_start1, unsigned int A_start2,
1089  unsigned int A_inc1, unsigned int A_inc2,
1090  unsigned int A_size1, unsigned int A_size2,
1091  unsigned int A_internal_size1, unsigned int A_internal_size2,
1092 
1093  const T * B,
1094  unsigned int B_start1, unsigned int B_start2,
1095  unsigned int B_inc1, unsigned int B_inc2,
1096  unsigned int B_internal_size1, unsigned int B_internal_size2)
1097  {
1098  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1099  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
1100 
1101  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
1102  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
1103  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = log(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
1104  }
1105 
1106 
1107  // log10
1108  template <typename T>
 1109  __global__ void matrix_col_element_log10_kernel(
 1110  T * A,
1111  unsigned int A_start1, unsigned int A_start2,
1112  unsigned int A_inc1, unsigned int A_inc2,
1113  unsigned int A_size1, unsigned int A_size2,
1114  unsigned int A_internal_size1, unsigned int A_internal_size2,
1115 
1116  const T * B,
1117  unsigned int B_start1, unsigned int B_start2,
1118  unsigned int B_inc1, unsigned int B_inc2,
1119  unsigned int B_internal_size1, unsigned int B_internal_size2)
1120  {
1121  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1122  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
1123 
1124  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
1125  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
1126  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = log10(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
1127  }
1128 
1129 
1130  // sin
1131  template <typename T>
 1132  __global__ void matrix_col_element_sin_kernel(
 1133  T * A,
1134  unsigned int A_start1, unsigned int A_start2,
1135  unsigned int A_inc1, unsigned int A_inc2,
1136  unsigned int A_size1, unsigned int A_size2,
1137  unsigned int A_internal_size1, unsigned int A_internal_size2,
1138 
1139  const T * B,
1140  unsigned int B_start1, unsigned int B_start2,
1141  unsigned int B_inc1, unsigned int B_inc2,
1142  unsigned int B_internal_size1, unsigned int B_internal_size2)
1143  {
1144  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1145  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
1146 
1147  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
1148  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
1149  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = sin(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
1150  }
1151 
1152 
1153  // sinh
1154  template <typename T>
 1155  __global__ void matrix_col_element_sinh_kernel(
 1156  T * A,
1157  unsigned int A_start1, unsigned int A_start2,
1158  unsigned int A_inc1, unsigned int A_inc2,
1159  unsigned int A_size1, unsigned int A_size2,
1160  unsigned int A_internal_size1, unsigned int A_internal_size2,
1161 
1162  const T * B,
1163  unsigned int B_start1, unsigned int B_start2,
1164  unsigned int B_inc1, unsigned int B_inc2,
1165  unsigned int B_internal_size1, unsigned int B_internal_size2)
1166  {
1167  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1168  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
1169 
1170  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
1171  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
1172  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = sinh(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
1173  }
1174 
1175 
1176  // sqrt
1177  template <typename T>
 1178  __global__ void matrix_col_element_sqrt_kernel(
 1179  T * A,
1180  unsigned int A_start1, unsigned int A_start2,
1181  unsigned int A_inc1, unsigned int A_inc2,
1182  unsigned int A_size1, unsigned int A_size2,
1183  unsigned int A_internal_size1, unsigned int A_internal_size2,
1184 
1185  const T * B,
1186  unsigned int B_start1, unsigned int B_start2,
1187  unsigned int B_inc1, unsigned int B_inc2,
1188  unsigned int B_internal_size1, unsigned int B_internal_size2)
1189  {
1190  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1191  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
1192 
1193  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
1194  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
1195  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = sqrt(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
1196  }
1197 
1198 
1199  // tan
1200  template <typename T>
 1201  __global__ void matrix_col_element_tan_kernel(
 1202  T * A,
1203  unsigned int A_start1, unsigned int A_start2,
1204  unsigned int A_inc1, unsigned int A_inc2,
1205  unsigned int A_size1, unsigned int A_size2,
1206  unsigned int A_internal_size1, unsigned int A_internal_size2,
1207 
1208  const T * B,
1209  unsigned int B_start1, unsigned int B_start2,
1210  unsigned int B_inc1, unsigned int B_inc2,
1211  unsigned int B_internal_size1, unsigned int B_internal_size2)
1212  {
1213  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1214  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
1215 
1216  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
1217  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
1218  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = tan(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
1219  }
1220 
1221 
1222  // tanh
1223  template <typename T>
 1224  __global__ void matrix_col_element_tanh_kernel(
 1225  T * A,
1226  unsigned int A_start1, unsigned int A_start2,
1227  unsigned int A_inc1, unsigned int A_inc2,
1228  unsigned int A_size1, unsigned int A_size2,
1229  unsigned int A_internal_size1, unsigned int A_internal_size2,
1230 
1231  const T * B,
1232  unsigned int B_start1, unsigned int B_start2,
1233  unsigned int B_inc1, unsigned int B_inc2,
1234  unsigned int B_internal_size1, unsigned int B_internal_size2)
1235  {
1236  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1237  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
1238 
1239  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
1240  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
1241  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = tanh(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
1242  }
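
All unary kernels above share one parameter list, so host code differs only in the kernel name. A hedged sketch for A = exp(B) follows; launch configuration and helper name are assumptions.

// Illustrative only: A = exp(B) element-wise on unpadded column-major matrices.
#include "viennacl/linalg/cuda/matrix_operations_col.hpp"

void elementwise_exp(float * d_A, const float * d_B,
                     unsigned int rows, unsigned int cols)
{
  viennacl::linalg::cuda::matrix_col_element_exp_kernel<<<128, 128>>>(
      d_A, 0, 0, 1, 1, rows, cols, rows, cols,
      d_B, 0, 0, 1, 1, rows, cols);
}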
1243 
1244 
1245 
1246  //
1247  // matrix-vector product
1248  //
1249 
1250  template <typename T>
1251  __global__ void vec_mul_col_kernel(
1252  const T * A,
1253  unsigned int A_row_start,
1254  unsigned int A_col_start,
1255  unsigned int A_row_inc,
1256  unsigned int A_col_inc,
1257  unsigned int A_row_size,
1258  unsigned int A_col_size,
1259  unsigned int A_internal_rows,
1260  unsigned int A_internal_cols,
1261  const T * v,
1262  unsigned int v_start,
1263  unsigned int v_inc,
1264  unsigned int v_size,
1265  T * result,
1266  unsigned int result_start,
1267  unsigned int result_inc,
1268  unsigned int result_size)
1269  {
1270 
1271  for (unsigned int row = blockIdx.x * blockDim.x + threadIdx.x; row < A_row_size; row += gridDim.x * blockDim.x)
1272  {
1273  T dot_prod = 0;
1274  for (unsigned int col = 0; col < A_col_size; ++col)
1275  dot_prod += A[(row * A_row_inc + A_row_start) + (col * A_col_inc + A_col_start) * A_internal_rows] * v[v_start + v_inc * col];
1276  result[row * result_inc + result_start] = dot_prod;
1277  }
1278  }
1279 
1280 
1281  template <typename T>
1282  __global__ void trans_vec_mul_col_kernel(
1283  const T * A,
1284  unsigned int A_row_start,
1285  unsigned int A_col_start,
1286  unsigned int A_row_inc,
1287  unsigned int A_col_inc,
1288  unsigned int A_row_size,
1289  unsigned int A_col_size,
1290  unsigned int A_internal_rows,
1291  unsigned int A_internal_cols,
1292  const T * v,
1293  unsigned int v_start,
1294  unsigned int v_inc,
1295  unsigned int v_size,
1296  T * result,
1297  unsigned int result_start,
1298  unsigned int result_inc,
1299  unsigned int result_size)
1300  {
1301  __shared__ T work[128];
1302 
1303  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1304  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
1305  unsigned int lid = threadIdx.x;
1306 
1307  for (unsigned int row = row_gid; row < A_col_size; row += gridDim.x)
1308  {
1309  T dot_prod = 0;
1310  for (unsigned int col = col_gid; col < A_row_size; col += blockDim.x)
1311  dot_prod += A[(row * A_col_inc + A_col_start) * A_internal_rows + col * A_row_inc + A_row_start] * v[v_start + v_inc * col];
1312  work[lid] = dot_prod;
1313 
1314  for(unsigned int stride = blockDim.x/2 ; stride>0 ; stride>>=1){
1315  __syncthreads();
1316  if(lid < stride)
1317  work[lid] += work[lid+stride];
1318  }
1319 
1320  if(lid == 0)
1321  result[row * result_inc + result_start] = work[0];
1322  }
1323  }
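
vec_mul_col_kernel assigns one result entry per thread, whereas trans_vec_mul_col_kernel reduces each result entry within a block through the shared array work[128], so its blockDim.x must be a power of two no larger than 128. A hedged sketch of the transposed product y = A^T x follows; grid and block sizes and the helper name are assumptions.

// Illustrative only: y = A^T * x, with A an unpadded column-major rows x cols matrix.
#include "viennacl/linalg/cuda/matrix_operations_col.hpp"

void trans_matvec(const float * d_A, const float * d_x, float * d_y,
                  unsigned int rows, unsigned int cols)
{
  viennacl::linalg::cuda::trans_vec_mul_col_kernel<<<128, 128>>>(
      d_A,
      0, 0, 1, 1,       // row/col start offsets and increments
      rows, cols,       // A_row_size, A_col_size
      rows, cols,       // A_internal_rows, A_internal_cols (no padding assumed)
      d_x, 0, 1, rows,  // x has one entry per row of A
      d_y, 0, 1, cols); // y has one entry per column of A
}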
1324 
1325 
1326  //
1327  // matrix-matrix products
1328  //
1329 
1330 
1331 
1332 
1333  //
1334  // scaled rank-1-update
1335  //
1336 
1337  // alpha on CPU
1338  template <typename T>
 1339  __global__ void scaled_rank1_update_col_kernel(
 1340  T * A,
1341  unsigned int A_start1, unsigned int A_start2,
1342  unsigned int A_inc1, unsigned int A_inc2,
1343  unsigned int A_size1, unsigned int A_size2,
1344  unsigned int A_internal_size1, unsigned int A_internal_size2,
1345 
1346  T val,
1347  unsigned int options2,
1348 
1349  const T * vec1,
1350  unsigned int start1,
1351  unsigned int inc1,
1352  unsigned int size1,
1353 
1354  const T * vec2,
1355  unsigned int start2,
1356  unsigned int inc2,
1357  unsigned int size2)
1358  {
1359  T alpha = val;
1360  if (options2 & (1 << 0))
1361  alpha = -alpha;
1362  if (options2 & (1 << 1))
1363  alpha = ((T)(1)) / alpha;
1364 
1365  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1366  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
1367 
1368  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
1369  {
1370  T tmp = alpha * vec1[row * inc1 + start1];
1371  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
1372  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] += tmp * vec2[col * inc2 + start2];
1373  }
1374  }
1375 
1376 
1377  // alpha on GPU
1378  template <typename T>
 1379  __global__ void scaled_rank1_update_col_kernel(
 1380  T * A,
1381  unsigned int A_start1, unsigned int A_start2,
1382  unsigned int A_inc1, unsigned int A_inc2,
1383  unsigned int A_size1, unsigned int A_size2,
1384  unsigned int A_internal_size1, unsigned int A_internal_size2,
1385 
1386  const T * val,
1387  unsigned int options2,
1388 
1389  const T * vec1,
1390  unsigned int start1,
1391  unsigned int inc1,
1392  unsigned int size1,
1393 
1394  const T * vec2,
1395  unsigned int start2,
1396  unsigned int inc2,
1397  unsigned int size2)
1398  {
1399  T alpha = *val;
1400  if (options2 & (1 << 0))
1401  alpha = -alpha;
1402  if (options2 & (1 << 1))
1403  alpha = ((T)(1)) / alpha;
1404 
1405  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1406  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
1407 
1408  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
1409  {
1410  T tmp = alpha * vec1[row * inc1 + start1];
1411  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
1412  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] += tmp * vec2[col * inc2 + start2];
1413  }
1414  }
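
Both scaled_rank1_update_col_kernel overloads compute A += alpha * vec1 * vec2^T, with options2 bit 0 negating alpha and bit 1 replacing it by 1/alpha. A hedged sketch using the host-scalar overload follows; launch configuration and helper name are assumptions.

// Illustrative only: rank-1 update of an unpadded column-major matrix.
#include "viennacl/linalg/cuda/matrix_operations_col.hpp"

void rank1_update(float * d_A, const float * d_x, const float * d_y,
                  unsigned int rows, unsigned int cols, float alpha)
{
  viennacl::linalg::cuda::scaled_rank1_update_col_kernel<<<128, 128>>>(
      d_A, 0, 0, 1, 1, rows, cols, rows, cols,
      alpha, 0u,            // options2 = 0: use alpha as passed
      d_x, 0, 1, rows,      // vec1: one entry per row of A
      d_y, 0, 1, cols);     // vec2: one entry per column of A
}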
1415 
1416 
1417 
1418  } // namespace cuda
1419  } //namespace linalg
1420 } //namespace viennacl
1421 
1422 
1423 #endif
Definition: matrix_operations_col.hpp:948
result_of::size_type< MatrixType >::type size2(MatrixType const &mat)
Generic routine for obtaining the number of columns of a matrix (ViennaCL, uBLAS, etc...
Definition: size.hpp:245
__global__ void ambm_col_kernel(T *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, T fac2, unsigned int options2, const T *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2, T fac3, unsigned int options3, const T *C, unsigned int C_start1, unsigned int C_start2, unsigned int C_inc1, unsigned int C_inc2, unsigned int C_internal_size1, unsigned int C_internal_size2)
Definition: matrix_operations_col.hpp:117
__global__ void matrix_col_element_tanh_kernel(T *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const T *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
Definition: matrix_operations_col.hpp:1224
Main namespace in ViennaCL. Holds all the basic types such as vector, matrix, etc. and defines operations upon them.
Definition: cpu_ram.hpp:29
result_of::size_type< T >::type start2(T const &obj)
Definition: start.hpp:83
__global__ void am_col_kernel(T *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, T fac2, unsigned int options2, const T *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
Definition: matrix_operations_col.hpp:38
__global__ void vec_mul_col_kernel(const T *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const T *v, unsigned int v_start, unsigned int v_inc, unsigned int v_size, T *result, unsigned int result_start, unsigned int result_inc, unsigned int result_size)
Definition: matrix_operations_col.hpp:1251
__global__ void matrix_col_element_fabs_kernel(T *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const T *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
Definition: matrix_operations_col.hpp:1040
__global__ void matrix_col_element_tan_kernel(T *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const T *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
Definition: matrix_operations_col.hpp:1201
__global__ void element_op_col_kernel(T *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const T *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2, const T *C, unsigned int C_start1, unsigned int C_start2, unsigned int C_inc1, unsigned int C_inc2, unsigned int C_internal_size1, unsigned int C_internal_size2, unsigned int op_type)
Definition: matrix_operations_col.hpp:760
__global__ void element_op_int_col_kernel(T *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const T *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2, const T *C, unsigned int C_start1, unsigned int C_start2, unsigned int C_inc1, unsigned int C_inc2, unsigned int C_internal_size1, unsigned int C_internal_size2, unsigned int op_type)
Definition: matrix_operations_col.hpp:809
__global__ void trans_vec_mul_col_kernel(const T *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const T *v, unsigned int v_start, unsigned int v_inc, unsigned int v_size, T *result, unsigned int result_start, unsigned int result_inc, unsigned int result_size)
Definition: matrix_operations_col.hpp:1282
__global__ void matrix_col_element_log10_kernel(T *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const T *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
Definition: matrix_operations_col.hpp:1109
__global__ void matrix_col_element_cosh_kernel(T *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const T *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
Definition: matrix_operations_col.hpp:994
vector_expression< const matrix_base< NumericT, F >, const unsigned int, op_row > row(const matrix_base< NumericT, F > &A, unsigned int i)
Definition: matrix.hpp:910
__global__ void matrix_col_element_sinh_kernel(T *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const T *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
Definition: matrix_operations_col.hpp:1155
__global__ void matrix_col_element_sin_kernel(T *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const T *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
Definition: matrix_operations_col.hpp:1132
void dot_prod(const MatrixType &A, unsigned int beg_ind, ScalarType &res)
Dot prod of particular column of martix A with it's self starting at a certain index beg_ind...
Definition: qr.hpp:154
__global__ void matrix_col_element_sqrt_kernel(T *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const T *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
Definition: matrix_operations_col.hpp:1178
__global__ void matrix_col_assign_kernel(T *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, T alpha)
Definition: matrix_operations_col.hpp:723