ViennaCL - The Vienna Computing Library  1.5.2
matrix_operations_row.hpp
Go to the documentation of this file.
1 #ifndef VIENNACL_LINALG_CUDA_MATRIX_OPERATIONS_ROW_HPP_
2 #define VIENNACL_LINALG_CUDA_MATRIX_OPERATIONS_ROW_HPP_
3 
4 /* =========================================================================
5  Copyright (c) 2010-2014, Institute for Microelectronics,
6  Institute for Analysis and Scientific Computing,
7  TU Wien.
8  Portions of this software are copyright by UChicago Argonne, LLC.
9 
10  -----------------
11  ViennaCL - The Vienna Computing Library
12  -----------------
13 
14  Project Head: Karl Rupp rupp@iue.tuwien.ac.at
15 
16  (A list of authors and contributors can be found in the PDF manual)
17 
18  License: MIT (X11), see file LICENSE in the base directory
19 ============================================================================= */
20 
26 namespace viennacl
27 {
28  namespace linalg
29  {
30  namespace cuda
31  {
32  //
33  // am
34  //
35 
/** @brief Row-major scaled assignment A = B * alpha (or A = B / alpha); alpha is passed by value ("alpha on CPU").
 *
 * Bit 0 of options2 negates alpha before use; bit 1 of options2 selects
 * division by alpha instead of multiplication.
 *
 * Entry (i, j) of a matrix X is addressed as
 *   X[(i * X_inc1 + X_start1) * X_internal_size2 + j * X_inc2 + X_start2].
 * Rows are distributed over blocks (stride gridDim.x) and columns over the
 * threads of a block (stride blockDim.x), so any 1D launch shape visits all
 * A_size1 x A_size2 entries. A_internal_size1 and B_internal_size1 are not read.
 *
 * NOTE(review): no __restrict__ qualifiers are used because callers may pass
 * the same buffer for A and B (in-place scaling) - confirm against callers.
 */
template <typename T>
__global__ void am_row_kernel(
          T * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          T fac2,
          unsigned int options2,
          const T * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2)
{
  T alpha = fac2;
  if (options2 & (1 << 0))
    alpha = -alpha;

  const bool divide_by_alpha = (options2 & (1 << 1)) != 0;

  // blockIdx.x / threadIdx.x serve directly as row / column seeds. Recovering
  // them from the flattened id (blockIdx.x * blockDim.x + threadIdx.x) costs a
  // division and a modulo and wraps in 32 bits for very large grids.
  for (unsigned int row = blockIdx.x; row < A_size1; row += gridDim.x)
  {
    // Row-dependent part of each offset, hoisted out of the column loop.
    const unsigned int A_row = (row * A_inc1 + A_start1) * A_internal_size2 + A_start2;
    const unsigned int B_row = (row * B_inc1 + B_start1) * B_internal_size2 + B_start2;

    for (unsigned int col = threadIdx.x; col < A_size2; col += blockDim.x)
    {
      const T b = B[B_row + col * B_inc2];
      A[A_row + col * A_inc2] = divide_by_alpha ? b / alpha : b * alpha;
    }
  }
}
72 
/** @brief Row-major scaled assignment A = B * alpha (or A = B / alpha); alpha is read through a pointer ("alpha on GPU").
 *
 * fac2 is dereferenced once by every thread, so it must point to memory
 * readable from device code. Bit 0 of options2 negates the loaded value;
 * bit 1 of options2 selects division by alpha instead of multiplication.
 *
 * Entry (i, j) of a matrix X is addressed as
 *   X[(i * X_inc1 + X_start1) * X_internal_size2 + j * X_inc2 + X_start2].
 * Rows are distributed over blocks (stride gridDim.x) and columns over the
 * threads of a block (stride blockDim.x), so any 1D launch shape visits all
 * A_size1 x A_size2 entries. A_internal_size1 and B_internal_size1 are not read.
 */
template <typename T>
__global__ void am_row_kernel(
          T * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const T * fac2,
          unsigned int options2,
          const T * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2)
{
  T alpha = *fac2;
  if (options2 & (1 << 0))
    alpha = -alpha;

  const bool divide_by_alpha = (options2 & (1 << 1)) != 0;

  // blockIdx.x / threadIdx.x serve directly as row / column seeds. Recovering
  // them from the flattened id (blockIdx.x * blockDim.x + threadIdx.x) costs a
  // division and a modulo and wraps in 32 bits for very large grids.
  for (unsigned int row = blockIdx.x; row < A_size1; row += gridDim.x)
  {
    // Row-dependent part of each offset, hoisted out of the column loop.
    const unsigned int A_row = (row * A_inc1 + A_start1) * A_internal_size2 + A_start2;
    const unsigned int B_row = (row * B_inc1 + B_start1) * B_internal_size2 + B_start2;

    for (unsigned int col = threadIdx.x; col < A_size2; col += blockDim.x)
    {
      const T b = B[B_row + col * B_inc2];
      A[A_row + col * A_inc2] = divide_by_alpha ? b / alpha : b * alpha;
    }
  }
}
109 
110 
111  //
112  // ambm
113  //
114 
/** @brief Row-major linear combination A = B (*|/) alpha + C (*|/) beta; alpha and beta are passed by value ("on CPU").
 *
 * Bit 0 of options2 / options3 negates alpha / beta before use; bit 1 of
 * options2 / options3 selects division by alpha / beta instead of multiplication.
 *
 * Entry (i, j) of a matrix X is addressed as
 *   X[(i * X_inc1 + X_start1) * X_internal_size2 + j * X_inc2 + X_start2].
 * Rows are distributed over blocks (stride gridDim.x) and columns over the
 * threads of a block (stride blockDim.x), so any 1D launch shape visits all
 * A_size1 x A_size2 entries. The *_internal_size1 arguments are not read.
 */
template <typename T>
__global__ void ambm_row_kernel(
          T * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          T fac2,
          unsigned int options2,
          const T * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2,

          T fac3,
          unsigned int options3,
          const T * C,
          unsigned int C_start1, unsigned int C_start2,
          unsigned int C_inc1,   unsigned int C_inc2,
          unsigned int C_internal_size1,  unsigned int C_internal_size2)
{
  T alpha = fac2;
  if (options2 & (1 << 0))
    alpha = -alpha;

  T beta = fac3;
  if (options3 & (1 << 0))
    beta = -beta;

  const bool divide_by_alpha = (options2 & (1 << 1)) != 0;
  const bool divide_by_beta  = (options3 & (1 << 1)) != 0;

  // blockIdx.x / threadIdx.x serve directly as row / column seeds. Recovering
  // them from the flattened id (blockIdx.x * blockDim.x + threadIdx.x) costs a
  // division and a modulo and wraps in 32 bits for very large grids.
  for (unsigned int row = blockIdx.x; row < A_size1; row += gridDim.x)
  {
    // Row-dependent part of each offset, hoisted out of the column loop.
    const unsigned int A_row = (row * A_inc1 + A_start1) * A_internal_size2 + A_start2;
    const unsigned int B_row = (row * B_inc1 + B_start1) * B_internal_size2 + B_start2;
    const unsigned int C_row = (row * C_inc1 + C_start1) * C_internal_size2 + C_start2;

    for (unsigned int col = threadIdx.x; col < A_size2; col += blockDim.x)
    {
      const T b = B[B_row + col * B_inc2];
      const T c = C[C_row + col * C_inc2];

      // Each arm keeps the complete two-term expression of its mode.
      T combined;
      if (divide_by_alpha)
        combined = divide_by_beta ? (b / alpha + c / beta) : (b / alpha + c * beta);
      else
        combined = divide_by_beta ? (b * alpha + c / beta) : (b * alpha + c * beta);

      A[A_row + col * A_inc2] = combined;
    }
  }
}
188 
189 
/** @brief Row-major linear combination A = B (*|/) alpha + C (*|/) beta; alpha by value ("on CPU"), beta through a pointer ("on GPU").
 *
 * fac3 is dereferenced once by every thread, so it must point to memory
 * readable from device code. Bit 0 of options2 / options3 negates alpha / beta
 * before use; bit 1 selects division by alpha / beta instead of multiplication.
 *
 * Entry (i, j) of a matrix X is addressed as
 *   X[(i * X_inc1 + X_start1) * X_internal_size2 + j * X_inc2 + X_start2].
 * Rows are distributed over blocks (stride gridDim.x) and columns over the
 * threads of a block (stride blockDim.x), so any 1D launch shape visits all
 * A_size1 x A_size2 entries. The *_internal_size1 arguments are not read.
 */
template <typename T>
__global__ void ambm_row_kernel(
          T * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          T fac2,
          unsigned int options2,
          const T * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2,

          const T * fac3,
          unsigned int options3,
          const T * C,
          unsigned int C_start1, unsigned int C_start2,
          unsigned int C_inc1,   unsigned int C_inc2,
          unsigned int C_internal_size1,  unsigned int C_internal_size2)
{
  T alpha = fac2;
  if (options2 & (1 << 0))
    alpha = -alpha;

  T beta = *fac3;
  if (options3 & (1 << 0))
    beta = -beta;

  const bool divide_by_alpha = (options2 & (1 << 1)) != 0;
  const bool divide_by_beta  = (options3 & (1 << 1)) != 0;

  // blockIdx.x / threadIdx.x serve directly as row / column seeds. Recovering
  // them from the flattened id (blockIdx.x * blockDim.x + threadIdx.x) costs a
  // division and a modulo and wraps in 32 bits for very large grids.
  for (unsigned int row = blockIdx.x; row < A_size1; row += gridDim.x)
  {
    // Row-dependent part of each offset, hoisted out of the column loop.
    const unsigned int A_row = (row * A_inc1 + A_start1) * A_internal_size2 + A_start2;
    const unsigned int B_row = (row * B_inc1 + B_start1) * B_internal_size2 + B_start2;
    const unsigned int C_row = (row * C_inc1 + C_start1) * C_internal_size2 + C_start2;

    for (unsigned int col = threadIdx.x; col < A_size2; col += blockDim.x)
    {
      const T b = B[B_row + col * B_inc2];
      const T c = C[C_row + col * C_inc2];

      // Each arm keeps the complete two-term expression of its mode.
      T combined;
      if (divide_by_alpha)
        combined = divide_by_beta ? (b / alpha + c / beta) : (b / alpha + c * beta);
      else
        combined = divide_by_beta ? (b * alpha + c / beta) : (b * alpha + c * beta);

      A[A_row + col * A_inc2] = combined;
    }
  }
}
263 
/** @brief Row-major linear combination A = B (*|/) alpha + C (*|/) beta; alpha through a pointer ("on GPU"), beta by value ("on CPU").
 *
 * fac2 is dereferenced once by every thread, so it must point to memory
 * readable from device code. Bit 0 of options2 / options3 negates alpha / beta
 * before use; bit 1 selects division by alpha / beta instead of multiplication.
 *
 * Entry (i, j) of a matrix X is addressed as
 *   X[(i * X_inc1 + X_start1) * X_internal_size2 + j * X_inc2 + X_start2].
 * Rows are distributed over blocks (stride gridDim.x) and columns over the
 * threads of a block (stride blockDim.x), so any 1D launch shape visits all
 * A_size1 x A_size2 entries. The *_internal_size1 arguments are not read.
 */
template <typename T>
__global__ void ambm_row_kernel(
          T * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const T * fac2,
          unsigned int options2,
          const T * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2,

          T fac3,
          unsigned int options3,
          const T * C,
          unsigned int C_start1, unsigned int C_start2,
          unsigned int C_inc1,   unsigned int C_inc2,
          unsigned int C_internal_size1,  unsigned int C_internal_size2)
{
  T alpha = *fac2;
  if (options2 & (1 << 0))
    alpha = -alpha;

  T beta = fac3;
  if (options3 & (1 << 0))
    beta = -beta;

  const bool divide_by_alpha = (options2 & (1 << 1)) != 0;
  const bool divide_by_beta  = (options3 & (1 << 1)) != 0;

  // blockIdx.x / threadIdx.x serve directly as row / column seeds. Recovering
  // them from the flattened id (blockIdx.x * blockDim.x + threadIdx.x) costs a
  // division and a modulo and wraps in 32 bits for very large grids.
  for (unsigned int row = blockIdx.x; row < A_size1; row += gridDim.x)
  {
    // Row-dependent part of each offset, hoisted out of the column loop.
    const unsigned int A_row = (row * A_inc1 + A_start1) * A_internal_size2 + A_start2;
    const unsigned int B_row = (row * B_inc1 + B_start1) * B_internal_size2 + B_start2;
    const unsigned int C_row = (row * C_inc1 + C_start1) * C_internal_size2 + C_start2;

    for (unsigned int col = threadIdx.x; col < A_size2; col += blockDim.x)
    {
      const T b = B[B_row + col * B_inc2];
      const T c = C[C_row + col * C_inc2];

      // Each arm keeps the complete two-term expression of its mode.
      T combined;
      if (divide_by_alpha)
        combined = divide_by_beta ? (b / alpha + c / beta) : (b / alpha + c * beta);
      else
        combined = divide_by_beta ? (b * alpha + c / beta) : (b * alpha + c * beta);

      A[A_row + col * A_inc2] = combined;
    }
  }
}
337 
338 
/** @brief Row-major linear combination A = B (*|/) alpha + C (*|/) beta; alpha and beta are read through pointers ("on GPU").
 *
 * fac2 and fac3 are each dereferenced once by every thread, so both must point
 * to memory readable from device code. Bit 0 of options2 / options3 negates
 * alpha / beta before use; bit 1 selects division instead of multiplication.
 *
 * Entry (i, j) of a matrix X is addressed as
 *   X[(i * X_inc1 + X_start1) * X_internal_size2 + j * X_inc2 + X_start2].
 * Rows are distributed over blocks (stride gridDim.x) and columns over the
 * threads of a block (stride blockDim.x), so any 1D launch shape visits all
 * A_size1 x A_size2 entries. The *_internal_size1 arguments are not read.
 */
template <typename T>
__global__ void ambm_row_kernel(
          T * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const T * fac2,
          unsigned int options2,
          const T * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2,

          const T * fac3,
          unsigned int options3,
          const T * C,
          unsigned int C_start1, unsigned int C_start2,
          unsigned int C_inc1,   unsigned int C_inc2,
          unsigned int C_internal_size1,  unsigned int C_internal_size2)
{
  T alpha = *fac2;
  if (options2 & (1 << 0))
    alpha = -alpha;

  T beta = *fac3;
  if (options3 & (1 << 0))
    beta = -beta;

  const bool divide_by_alpha = (options2 & (1 << 1)) != 0;
  const bool divide_by_beta  = (options3 & (1 << 1)) != 0;

  // blockIdx.x / threadIdx.x serve directly as row / column seeds. Recovering
  // them from the flattened id (blockIdx.x * blockDim.x + threadIdx.x) costs a
  // division and a modulo and wraps in 32 bits for very large grids.
  for (unsigned int row = blockIdx.x; row < A_size1; row += gridDim.x)
  {
    // Row-dependent part of each offset, hoisted out of the column loop.
    const unsigned int A_row = (row * A_inc1 + A_start1) * A_internal_size2 + A_start2;
    const unsigned int B_row = (row * B_inc1 + B_start1) * B_internal_size2 + B_start2;
    const unsigned int C_row = (row * C_inc1 + C_start1) * C_internal_size2 + C_start2;

    for (unsigned int col = threadIdx.x; col < A_size2; col += blockDim.x)
    {
      const T b = B[B_row + col * B_inc2];
      const T c = C[C_row + col * C_inc2];

      // Each arm keeps the complete two-term expression of its mode.
      T combined;
      if (divide_by_alpha)
        combined = divide_by_beta ? (b / alpha + c / beta) : (b / alpha + c * beta);
      else
        combined = divide_by_beta ? (b * alpha + c / beta) : (b * alpha + c * beta);

      A[A_row + col * A_inc2] = combined;
    }
  }
}
412 
413 
414  //
415  // ambm_m
416  //
417 
/** @brief Row-major accumulation A += B (*|/) alpha + C (*|/) beta; alpha and beta are passed by value ("on CPU").
 *
 * Bit 0 of options2 / options3 negates alpha / beta before use; bit 1 of
 * options2 / options3 selects division by alpha / beta instead of multiplication.
 * The two scaled terms are summed first and the sum is then added to A.
 *
 * Entry (i, j) of a matrix X is addressed as
 *   X[(i * X_inc1 + X_start1) * X_internal_size2 + j * X_inc2 + X_start2].
 * Rows are distributed over blocks (stride gridDim.x) and columns over the
 * threads of a block (stride blockDim.x), so any 1D launch shape visits all
 * A_size1 x A_size2 entries. The *_internal_size1 arguments are not read.
 */
template <typename T>
__global__ void ambm_m_row_kernel(
          T * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          T fac2,
          unsigned int options2,
          const T * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2,

          T fac3,
          unsigned int options3,
          const T * C,
          unsigned int C_start1, unsigned int C_start2,
          unsigned int C_inc1,   unsigned int C_inc2,
          unsigned int C_internal_size1,  unsigned int C_internal_size2)
{
  T alpha = fac2;
  if (options2 & (1 << 0))
    alpha = -alpha;

  T beta = fac3;
  if (options3 & (1 << 0))
    beta = -beta;

  const bool divide_by_alpha = (options2 & (1 << 1)) != 0;
  const bool divide_by_beta  = (options3 & (1 << 1)) != 0;

  // blockIdx.x / threadIdx.x serve directly as row / column seeds. Recovering
  // them from the flattened id (blockIdx.x * blockDim.x + threadIdx.x) costs a
  // division and a modulo and wraps in 32 bits for very large grids.
  for (unsigned int row = blockIdx.x; row < A_size1; row += gridDim.x)
  {
    // Row-dependent part of each offset, hoisted out of the column loop.
    const unsigned int A_row = (row * A_inc1 + A_start1) * A_internal_size2 + A_start2;
    const unsigned int B_row = (row * B_inc1 + B_start1) * B_internal_size2 + B_start2;
    const unsigned int C_row = (row * C_inc1 + C_start1) * C_internal_size2 + C_start2;

    for (unsigned int col = threadIdx.x; col < A_size2; col += blockDim.x)
    {
      const T b = B[B_row + col * B_inc2];
      const T c = C[C_row + col * C_inc2];

      // Each arm keeps the complete two-term expression of its mode.
      T increment;
      if (divide_by_alpha)
        increment = divide_by_beta ? (b / alpha + c / beta) : (b / alpha + c * beta);
      else
        increment = divide_by_beta ? (b * alpha + c / beta) : (b * alpha + c * beta);

      A[A_row + col * A_inc2] += increment;
    }
  }
}
491 
492 
/** @brief Row-major accumulation A += B (*|/) alpha + C (*|/) beta; alpha by value ("on CPU"), beta through a pointer ("on GPU").
 *
 * fac3 is dereferenced once by every thread, so it must point to memory
 * readable from device code. Bit 0 of options2 / options3 negates alpha / beta
 * before use; bit 1 selects division by alpha / beta instead of multiplication.
 * The two scaled terms are summed first and the sum is then added to A.
 *
 * Entry (i, j) of a matrix X is addressed as
 *   X[(i * X_inc1 + X_start1) * X_internal_size2 + j * X_inc2 + X_start2].
 * Rows are distributed over blocks (stride gridDim.x) and columns over the
 * threads of a block (stride blockDim.x), so any 1D launch shape visits all
 * A_size1 x A_size2 entries. The *_internal_size1 arguments are not read.
 */
template <typename T>
__global__ void ambm_m_row_kernel(
          T * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          T fac2,
          unsigned int options2,
          const T * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2,

          const T * fac3,
          unsigned int options3,
          const T * C,
          unsigned int C_start1, unsigned int C_start2,
          unsigned int C_inc1,   unsigned int C_inc2,
          unsigned int C_internal_size1,  unsigned int C_internal_size2)
{
  T alpha = fac2;
  if (options2 & (1 << 0))
    alpha = -alpha;

  T beta = *fac3;
  if (options3 & (1 << 0))
    beta = -beta;

  const bool divide_by_alpha = (options2 & (1 << 1)) != 0;
  const bool divide_by_beta  = (options3 & (1 << 1)) != 0;

  // blockIdx.x / threadIdx.x serve directly as row / column seeds. Recovering
  // them from the flattened id (blockIdx.x * blockDim.x + threadIdx.x) costs a
  // division and a modulo and wraps in 32 bits for very large grids.
  for (unsigned int row = blockIdx.x; row < A_size1; row += gridDim.x)
  {
    // Row-dependent part of each offset, hoisted out of the column loop.
    const unsigned int A_row = (row * A_inc1 + A_start1) * A_internal_size2 + A_start2;
    const unsigned int B_row = (row * B_inc1 + B_start1) * B_internal_size2 + B_start2;
    const unsigned int C_row = (row * C_inc1 + C_start1) * C_internal_size2 + C_start2;

    for (unsigned int col = threadIdx.x; col < A_size2; col += blockDim.x)
    {
      const T b = B[B_row + col * B_inc2];
      const T c = C[C_row + col * C_inc2];

      // Each arm keeps the complete two-term expression of its mode.
      T increment;
      if (divide_by_alpha)
        increment = divide_by_beta ? (b / alpha + c / beta) : (b / alpha + c * beta);
      else
        increment = divide_by_beta ? (b * alpha + c / beta) : (b * alpha + c * beta);

      A[A_row + col * A_inc2] += increment;
    }
  }
}
566 
/** @brief Row-major accumulation A += B (*|/) alpha + C (*|/) beta; alpha through a pointer ("on GPU"), beta by value ("on CPU").
 *
 * fac2 is dereferenced once by every thread, so it must point to memory
 * readable from device code. Bit 0 of options2 / options3 negates alpha / beta
 * before use; bit 1 selects division by alpha / beta instead of multiplication.
 * The two scaled terms are summed first and the sum is then added to A.
 *
 * Entry (i, j) of a matrix X is addressed as
 *   X[(i * X_inc1 + X_start1) * X_internal_size2 + j * X_inc2 + X_start2].
 * Rows are distributed over blocks (stride gridDim.x) and columns over the
 * threads of a block (stride blockDim.x), so any 1D launch shape visits all
 * A_size1 x A_size2 entries. The *_internal_size1 arguments are not read.
 */
template <typename T>
__global__ void ambm_m_row_kernel(
          T * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const T * fac2,
          unsigned int options2,
          const T * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2,

          T fac3,
          unsigned int options3,
          const T * C,
          unsigned int C_start1, unsigned int C_start2,
          unsigned int C_inc1,   unsigned int C_inc2,
          unsigned int C_internal_size1,  unsigned int C_internal_size2)
{
  T alpha = *fac2;
  if (options2 & (1 << 0))
    alpha = -alpha;

  T beta = fac3;
  if (options3 & (1 << 0))
    beta = -beta;

  const bool divide_by_alpha = (options2 & (1 << 1)) != 0;
  const bool divide_by_beta  = (options3 & (1 << 1)) != 0;

  // blockIdx.x / threadIdx.x serve directly as row / column seeds. Recovering
  // them from the flattened id (blockIdx.x * blockDim.x + threadIdx.x) costs a
  // division and a modulo and wraps in 32 bits for very large grids.
  for (unsigned int row = blockIdx.x; row < A_size1; row += gridDim.x)
  {
    // Row-dependent part of each offset, hoisted out of the column loop.
    const unsigned int A_row = (row * A_inc1 + A_start1) * A_internal_size2 + A_start2;
    const unsigned int B_row = (row * B_inc1 + B_start1) * B_internal_size2 + B_start2;
    const unsigned int C_row = (row * C_inc1 + C_start1) * C_internal_size2 + C_start2;

    for (unsigned int col = threadIdx.x; col < A_size2; col += blockDim.x)
    {
      const T b = B[B_row + col * B_inc2];
      const T c = C[C_row + col * C_inc2];

      // Each arm keeps the complete two-term expression of its mode.
      T increment;
      if (divide_by_alpha)
        increment = divide_by_beta ? (b / alpha + c / beta) : (b / alpha + c * beta);
      else
        increment = divide_by_beta ? (b * alpha + c / beta) : (b * alpha + c * beta);

      A[A_row + col * A_inc2] += increment;
    }
  }
}
640 
641 
/** @brief Row-major accumulation A += B (*|/) alpha + C (*|/) beta; alpha and beta are read through pointers ("on GPU").
 *
 * fac2 and fac3 are each dereferenced once by every thread, so both must point
 * to memory readable from device code. Bit 0 of options2 / options3 negates
 * alpha / beta before use; bit 1 selects division instead of multiplication.
 * The two scaled terms are summed first and the sum is then added to A.
 *
 * Entry (i, j) of a matrix X is addressed as
 *   X[(i * X_inc1 + X_start1) * X_internal_size2 + j * X_inc2 + X_start2].
 * Rows are distributed over blocks (stride gridDim.x) and columns over the
 * threads of a block (stride blockDim.x), so any 1D launch shape visits all
 * A_size1 x A_size2 entries. The *_internal_size1 arguments are not read.
 */
template <typename T>
__global__ void ambm_m_row_kernel(
          T * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const T * fac2,
          unsigned int options2,
          const T * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2,

          const T * fac3,
          unsigned int options3,
          const T * C,
          unsigned int C_start1, unsigned int C_start2,
          unsigned int C_inc1,   unsigned int C_inc2,
          unsigned int C_internal_size1,  unsigned int C_internal_size2)
{
  T alpha = *fac2;
  if (options2 & (1 << 0))
    alpha = -alpha;

  T beta = *fac3;
  if (options3 & (1 << 0))
    beta = -beta;

  const bool divide_by_alpha = (options2 & (1 << 1)) != 0;
  const bool divide_by_beta  = (options3 & (1 << 1)) != 0;

  // blockIdx.x / threadIdx.x serve directly as row / column seeds. Recovering
  // them from the flattened id (blockIdx.x * blockDim.x + threadIdx.x) costs a
  // division and a modulo and wraps in 32 bits for very large grids.
  for (unsigned int row = blockIdx.x; row < A_size1; row += gridDim.x)
  {
    // Row-dependent part of each offset, hoisted out of the column loop.
    const unsigned int A_row = (row * A_inc1 + A_start1) * A_internal_size2 + A_start2;
    const unsigned int B_row = (row * B_inc1 + B_start1) * B_internal_size2 + B_start2;
    const unsigned int C_row = (row * C_inc1 + C_start1) * C_internal_size2 + C_start2;

    for (unsigned int col = threadIdx.x; col < A_size2; col += blockDim.x)
    {
      const T b = B[B_row + col * B_inc2];
      const T c = C[C_row + col * C_inc2];

      // Each arm keeps the complete two-term expression of its mode.
      T increment;
      if (divide_by_alpha)
        increment = divide_by_beta ? (b / alpha + c / beta) : (b / alpha + c * beta);
      else
        increment = divide_by_beta ? (b * alpha + c / beta) : (b * alpha + c * beta);

      A[A_row + col * A_inc2] += increment;
    }
  }
}
715 
716  //
717  // assignments
718  //
719 
/** @brief Fills every entry of the A_size1 x A_size2 view of a row-major matrix with alpha.
 *
 * Entry (i, j) is addressed as
 *   A[(i * A_inc1 + A_start1) * A_internal_size2 + j * A_inc2 + A_start2],
 * so elements outside the view (padding, skipped strides) are left untouched.
 * Rows are distributed over blocks (stride gridDim.x) and columns over the
 * threads of a block (stride blockDim.x), so any 1D launch shape visits all
 * entries. A_internal_size1 is not read.
 */
template <typename T>
__global__ void matrix_row_assign_kernel(
          T * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,
          T alpha)
{
  // blockIdx.x / threadIdx.x serve directly as row / column seeds. Recovering
  // them from the flattened id (blockIdx.x * blockDim.x + threadIdx.x) costs a
  // division and a modulo and wraps in 32 bits for very large grids.
  for (unsigned int row = blockIdx.x; row < A_size1; row += gridDim.x)
  {
    // Row-dependent part of the offset, hoisted out of the column loop.
    const unsigned int A_row = (row * A_inc1 + A_start1) * A_internal_size2 + A_start2;

    for (unsigned int col = threadIdx.x; col < A_size2; col += blockDim.x)
      A[A_row + col * A_inc2] = alpha;
  }
}
736 
737 
/** @brief Writes alpha to the diagonal entries A(i, i) of a row-major matrix view.
 *
 * Entry (i, i) is addressed as
 *   A[(i * A_inc1 + A_start1) * A_internal_size2 + i * A_inc2 + A_start2].
 * Each thread starts at its flattened global id and advances by the total
 * number of launched threads (blockDim.x * gridDim.x).
 * A_internal_size1 is not read.
 *
 * NOTE(review): the declarator line of this kernel was absent from the
 * reviewed text; the name matrix_row_diagonal_assign_kernel follows upstream
 * ViennaCL - confirm against the call sites.
 */
template <typename T>
__global__ void matrix_row_diagonal_assign_kernel(
          T * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,
          T alpha)
{
  // Only min(A_size1, A_size2) diagonal entries exist. Iterating up to A_size1
  // on a matrix with more rows than columns would address columns >= A_size2.
  const unsigned int diag_len = (A_size1 < A_size2) ? A_size1 : A_size2;
  const unsigned int stride   = blockDim.x * gridDim.x;

  for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < diag_len; i += stride)
    A[(i * A_inc1 + A_start1) * A_internal_size2 + i * A_inc2 + A_start2] = alpha;
}
752 
753  //
754  // binary element-wise operations
755  //
756 
/** @brief Entry-wise binary operation on row-major matrices: A(i,j) = B(i,j) op C(i,j).
 *
 * op_type selects the operation: 0 = product, 1 = division, 2 = pow(B, C).
 * Any other op_type leaves A untouched.
 *
 * Entry (i, j) of a matrix X is addressed as
 *   X[(i * X_inc1 + X_start1) * X_internal_size2 + j * X_inc2 + X_start2].
 * Rows are distributed over blocks (stride gridDim.x) and columns over the
 * threads of a block (stride blockDim.x), so any 1D launch shape visits all
 * A_size1 x A_size2 entries. The *_internal_size1 arguments are not read.
 */
template <typename T>
__global__ void element_op_row_kernel(
          T * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const T * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2,

          const T * C,
          unsigned int C_start1, unsigned int C_start2,
          unsigned int C_inc1,   unsigned int C_inc2,
          unsigned int C_internal_size1,  unsigned int C_internal_size2,

          unsigned int op_type) //0: product, 1: division, 2: pow
{
  // Unknown operation codes are a no-op.
  if (op_type > 2)
    return;

  // blockIdx.x / threadIdx.x serve directly as row / column seeds. Recovering
  // them from the flattened id (blockIdx.x * blockDim.x + threadIdx.x) costs a
  // division and a modulo and wraps in 32 bits for very large grids.
  for (unsigned int row = blockIdx.x; row < A_size1; row += gridDim.x)
  {
    // Row-dependent part of each offset, hoisted out of the column loop.
    const unsigned int A_row = (row * A_inc1 + A_start1) * A_internal_size2 + A_start2;
    const unsigned int B_row = (row * B_inc1 + B_start1) * B_internal_size2 + B_start2;
    const unsigned int C_row = (row * C_inc1 + C_start1) * C_internal_size2 + C_start2;

    for (unsigned int col = threadIdx.x; col < A_size2; col += blockDim.x)
    {
      const T b = B[B_row + col * B_inc2];
      const T c = C[C_row + col * C_inc2];

      T result;
      if (op_type == 2)
        result = pow(b, c);
      else if (op_type == 1)
        result = b / c;
      else
        result = b * c;

      A[A_row + col * A_inc2] = result;
    }
  }
}
805 
/** @brief Entry-wise binary operation on row-major matrices without a pow branch: A(i,j) = B(i,j) op C(i,j).
 *
 * op_type selects the operation: 0 = product, 1 = division. Any other value,
 * including 2 (pow in element_op_row_kernel), leaves A untouched.
 *
 * Entry (i, j) of a matrix X is addressed as
 *   X[(i * X_inc1 + X_start1) * X_internal_size2 + j * X_inc2 + X_start2].
 * Rows are distributed over blocks (stride gridDim.x) and columns over the
 * threads of a block (stride blockDim.x), so any 1D launch shape visits all
 * A_size1 x A_size2 entries. The *_internal_size1 arguments are not read.
 */
template <typename T>
__global__ void element_op_int_row_kernel(
          T * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const T * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2,

          const T * C,
          unsigned int C_start1, unsigned int C_start2,
          unsigned int C_inc1,   unsigned int C_inc2,
          unsigned int C_internal_size1,  unsigned int C_internal_size2,

          unsigned int op_type) //0: product, 1: division (2: pow is not implemented here)
{
  // Only product and division are implemented; everything else is a no-op.
  if (op_type > 1)
    return;

  const bool divide = (op_type == 1);

  // blockIdx.x / threadIdx.x serve directly as row / column seeds. Recovering
  // them from the flattened id (blockIdx.x * blockDim.x + threadIdx.x) costs a
  // division and a modulo and wraps in 32 bits for very large grids.
  for (unsigned int row = blockIdx.x; row < A_size1; row += gridDim.x)
  {
    // Row-dependent part of each offset, hoisted out of the column loop.
    const unsigned int A_row = (row * A_inc1 + A_start1) * A_internal_size2 + A_start2;
    const unsigned int B_row = (row * B_inc1 + B_start1) * B_internal_size2 + B_start2;
    const unsigned int C_row = (row * C_inc1 + C_start1) * C_internal_size2 + C_start2;

    for (unsigned int col = threadIdx.x; col < A_size2; col += blockDim.x)
    {
      const T b = B[B_row + col * B_inc2];
      const T c = C[C_row + col * C_inc2];
      A[A_row + col * A_inc2] = divide ? b / c : b * c;
    }
  }
}
846 
847  //
848  // unary element-wise operations
849  //
850 
851  // abs
// Element-wise abs on row-major matrices: A(i,j) = abs(B(i,j)).
// Rows are walked with stride gridDim.x, columns with stride blockDim.x.
// *_start/*_inc give offset and step of the addressed entries; *_internal_size2
// is the row pitch used for addressing. A_internal_size1 and B_internal_size1
// are not read.
template <typename T>
__global__ void matrix_row_element_abs_kernel(
          T * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const T * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2)
{
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = abs(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
}
872 
873 
874  // acos
// Element-wise acos on row-major matrices: A(i,j) = acos(B(i,j)).
// Rows are walked with stride gridDim.x, columns with stride blockDim.x.
// *_start/*_inc give offset and step of the addressed entries; *_internal_size2
// is the row pitch used for addressing. A_internal_size1 and B_internal_size1
// are not read.
template <typename T>
__global__ void matrix_row_element_acos_kernel(
          T * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const T * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2)
{
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = acos(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
}
895 
896 
897  // asin
// Element-wise asin on row-major matrices: A(i,j) = asin(B(i,j)).
// Rows are walked with stride gridDim.x, columns with stride blockDim.x.
// *_start/*_inc give offset and step of the addressed entries; *_internal_size2
// is the row pitch used for addressing. A_internal_size1 and B_internal_size1
// are not read.
// NOTE(review): kernel name restored following the naming of the sibling
// unary kernels - confirm against the host-side launcher.
template <typename T>
__global__ void matrix_row_element_asin_kernel(
          T * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const T * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2)
{
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = asin(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
}
918 
919 
920  // atan
// Element-wise atan on row-major matrices: A(i,j) = atan(B(i,j)).
// Rows are walked with stride gridDim.x, columns with stride blockDim.x.
// *_start/*_inc give offset and step of the addressed entries; *_internal_size2
// is the row pitch used for addressing. A_internal_size1 and B_internal_size1
// are not read.
template <typename T>
__global__ void matrix_row_element_atan_kernel(
          T * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const T * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2)
{
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = atan(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
}
941 
942 
943  // ceil
// Element-wise ceil on row-major matrices: A(i,j) = ceil(B(i,j)).
// Rows are walked with stride gridDim.x, columns with stride blockDim.x.
// *_start/*_inc give offset and step of the addressed entries; *_internal_size2
// is the row pitch used for addressing. A_internal_size1 and B_internal_size1
// are not read.
template <typename T>
__global__ void matrix_row_element_ceil_kernel(
          T * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const T * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2)
{
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = ceil(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
}
964 
965 
966  // cos
// Element-wise cos on row-major matrices: A(i,j) = cos(B(i,j)).
// Rows are walked with stride gridDim.x, columns with stride blockDim.x.
// *_start/*_inc give offset and step of the addressed entries; *_internal_size2
// is the row pitch used for addressing. A_internal_size1 and B_internal_size1
// are not read.
template <typename T>
__global__ void matrix_row_element_cos_kernel(
          T * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const T * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2)
{
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = cos(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
}
987 
988 
989  // cosh
// Element-wise cosh on row-major matrices: A(i,j) = cosh(B(i,j)).
// Rows are walked with stride gridDim.x, columns with stride blockDim.x.
// *_start/*_inc give offset and step of the addressed entries; *_internal_size2
// is the row pitch used for addressing. A_internal_size1 and B_internal_size1
// are not read.
template <typename T>
__global__ void matrix_row_element_cosh_kernel(
          T * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const T * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2)
{
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = cosh(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
}
1010 
1011 
1012  // exp
// Element-wise exp on row-major matrices: A(i,j) = exp(B(i,j)).
// Rows are walked with stride gridDim.x, columns with stride blockDim.x.
// *_start/*_inc give offset and step of the addressed entries; *_internal_size2
// is the row pitch used for addressing. A_internal_size1 and B_internal_size1
// are not read.
template <typename T>
__global__ void matrix_row_element_exp_kernel(
          T * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const T * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2)
{
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = exp(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
}
1033 
1034 
1035  // fabs
// Element-wise fabs on row-major matrices: A(i,j) = fabs(B(i,j)).
// Rows are walked with stride gridDim.x, columns with stride blockDim.x.
// *_start/*_inc give offset and step of the addressed entries; *_internal_size2
// is the row pitch used for addressing. A_internal_size1 and B_internal_size1
// are not read.
template <typename T>
__global__ void matrix_row_element_fabs_kernel(
          T * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const T * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2)
{
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = fabs(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
}
1056 
1057 
1058  // floor
// Element-wise floor on row-major matrices: A(i,j) = floor(B(i,j)).
// Rows are walked with stride gridDim.x, columns with stride blockDim.x.
// *_start/*_inc give offset and step of the addressed entries; *_internal_size2
// is the row pitch used for addressing. A_internal_size1 and B_internal_size1
// are not read.
template <typename T>
__global__ void matrix_row_element_floor_kernel(
          T * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const T * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2)
{
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = floor(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
}
1079 
1080 
1081  // log
// Element-wise log on row-major matrices: A(i,j) = log(B(i,j)).
// Rows are walked with stride gridDim.x, columns with stride blockDim.x.
// *_start/*_inc give offset and step of the addressed entries; *_internal_size2
// is the row pitch used for addressing. A_internal_size1 and B_internal_size1
// are not read.
// NOTE(review): kernel name restored following the naming of the sibling
// unary kernels - confirm against the host-side launcher.
template <typename T>
__global__ void matrix_row_element_log_kernel(
          T * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const T * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2)
{
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = log(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
}
1102 
1103 
1104  // log10
// Element-wise log10 on row-major matrices: A(i,j) = log10(B(i,j)).
// Rows are walked with stride gridDim.x, columns with stride blockDim.x.
// *_start/*_inc give offset and step of the addressed entries; *_internal_size2
// is the row pitch used for addressing. A_internal_size1 and B_internal_size1
// are not read.
template <typename T>
__global__ void matrix_row_element_log10_kernel(
          T * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const T * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2)
{
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = log10(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
}
1125 
1126 
1127  // sin
// Element-wise sin on row-major matrices: A(i,j) = sin(B(i,j)).
// Rows are walked with stride gridDim.x, columns with stride blockDim.x.
// *_start/*_inc give offset and step of the addressed entries; *_internal_size2
// is the row pitch used for addressing. A_internal_size1 and B_internal_size1
// are not read.
template <typename T>
__global__ void matrix_row_element_sin_kernel(
          T * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const T * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2)
{
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = sin(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
}
1148 
1149 
1150  // sinh
// Element-wise sinh on row-major matrices: A(i,j) = sinh(B(i,j)).
// Rows are walked with stride gridDim.x, columns with stride blockDim.x.
// *_start/*_inc give offset and step of the addressed entries; *_internal_size2
// is the row pitch used for addressing. A_internal_size1 and B_internal_size1
// are not read.
template <typename T>
__global__ void matrix_row_element_sinh_kernel(
          T * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const T * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2)
{
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = sinh(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
}
1171 
1172 
1173  // sqrt
// Element-wise sqrt on row-major matrices: A(i,j) = sqrt(B(i,j)).
// Rows are walked with stride gridDim.x, columns with stride blockDim.x.
// *_start/*_inc give offset and step of the addressed entries; *_internal_size2
// is the row pitch used for addressing. A_internal_size1 and B_internal_size1
// are not read.
template <typename T>
__global__ void matrix_row_element_sqrt_kernel(
          T * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const T * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2)
{
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = sqrt(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
}
1194 
1195 
1196  // tan
// Element-wise tan on row-major matrices: A(i,j) = tan(B(i,j)).
// Rows are walked with stride gridDim.x, columns with stride blockDim.x.
// *_start/*_inc give offset and step of the addressed entries; *_internal_size2
// is the row pitch used for addressing. A_internal_size1 and B_internal_size1
// are not read.
template <typename T>
__global__ void matrix_row_element_tan_kernel(
          T * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const T * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2)
{
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = tan(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
}
1217 
1218 
1219  // tanh
// Element-wise tanh on row-major matrices: A(i,j) = tanh(B(i,j)).
// Rows are walked with stride gridDim.x, columns with stride blockDim.x.
// *_start/*_inc give offset and step of the addressed entries; *_internal_size2
// is the row pitch used for addressing. A_internal_size1 and B_internal_size1
// are not read.
template <typename T>
__global__ void matrix_row_element_tanh_kernel(
          T * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const T * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2)
{
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = tanh(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
}
1240 
1241 
1242 
1243  //
1244  // matrix-vector product
1245  //
1246 
// Matrix-vector product result = A * v for a row-major matrix A.
//
// Each block handles rows blockIdx.x, blockIdx.x + gridDim.x, ...; the threads
// of the block accumulate partial dot products over the columns (stride
// blockDim.x) and combine them with a halving reduction in shared memory.
// Thread 0 writes the final value for the row.
//
// Launch preconditions implied by the code below:
//  - blockDim.x <= 128, because work[] has 128 entries indexed by threadIdx.x;
//  - blockDim.x should be a power of two, otherwise the halving reduction
//    skips some partial sums.
// A_internal_rows, v_size and result_size are not read.
template <typename T>
__global__ void vec_mul_row_kernel(
          const T * A,
          unsigned int A_row_start,
          unsigned int A_col_start,
          unsigned int A_row_inc,
          unsigned int A_col_inc,
          unsigned int A_row_size,
          unsigned int A_col_size,
          unsigned int A_internal_rows,
          unsigned int A_internal_cols,
          const T * v,
          unsigned int v_start,
          unsigned int v_inc,
          unsigned int v_size,
          T * result,
          unsigned int result_start,
          unsigned int result_inc,
          unsigned int result_size)
{
  __shared__ T work[128];

  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
  unsigned int lid = threadIdx.x;

  for (unsigned int row = row_gid; row < A_row_size; row += gridDim.x)
  {
    // Per-thread partial dot product over this thread's share of the columns.
    T dot_prod = 0;
    for (unsigned int col = col_gid; col < A_col_size; col += blockDim.x)
      dot_prod += A[(row * A_row_inc + A_row_start) * A_internal_cols + col * A_col_inc + A_col_start] * v[v_start + v_inc * col];
    work[lid] = dot_prod;

    // Halving reduction; the barrier at the top of each step makes the writes
    // of the previous step (or the initial store) visible.
    for(unsigned int stride = blockDim.x/2 ; stride>0 ; stride>>=1){
      __syncthreads();
      if(lid < stride)
        work[lid] += work[lid+stride];
    }

    if(lid == 0)
      result[row * result_inc + result_start] = work[0];

    // Without this barrier, threads that are idle in the last reduction steps
    // could start the next row and overwrite work[] entries that lower-indexed
    // threads are still reading. All threads of a block share the same row
    // sequence, so every thread reaches this barrier the same number of times.
    __syncthreads();
  }
}
1290 
1291 
// Transposed matrix-vector product result = trans(A) * v for a row-major A.
//
// Every thread owns whole output entries: it starts at its flat thread index
// and advances by the total number of launched threads. For each output entry
// it sums over all A_row_size rows of A sequentially.
// A_internal_rows, v_size and result_size are not read.
template <typename T>
__global__ void trans_vec_mul_row_kernel(
          const T * A,
          unsigned int A_row_start,
          unsigned int A_col_start,
          unsigned int A_row_inc,
          unsigned int A_col_inc,
          unsigned int A_row_size,
          unsigned int A_col_size,
          unsigned int A_internal_rows,
          unsigned int A_internal_cols,
          const T * v,
          unsigned int v_start,
          unsigned int v_inc,
          unsigned int v_size,
          T * result,
          unsigned int result_start,
          unsigned int result_inc,
          unsigned int result_size)
{
  const unsigned int first_entry   = blockIdx.x * blockDim.x + threadIdx.x;
  const unsigned int total_threads = gridDim.x * blockDim.x;

  for (unsigned int out = first_entry; out < A_col_size; out += total_threads)
  {
    // Column of A that feeds output entry 'out'.
    const unsigned int column_offset = out * A_col_inc + A_col_start;

    T acc = 0;
    unsigned int k = 0;
    while (k < A_row_size)
    {
      acc += A[column_offset + (k * A_row_inc + A_row_start) * A_internal_cols] * v[v_start + v_inc * k];
      ++k;
    }

    result[out * result_inc + result_start] = acc;
  }
}
1320 
1321 
1322  //
1323  // matrix-matrix products
1324  //
1325 
1326 
1327 
1328 
1329  //
1330  // scaled rank-1-update
1331  //
1332 
1333  // alpha on CPU
// Scaled rank-1 update A += alpha * vec1 * trans(vec2) on a row-major matrix,
// with the scalar passed by value (alpha on CPU).
//
// options2 bit 0: negate the scalar; bit 1: use its reciprocal (applied after
// the optional negation). Rows are walked with stride gridDim.x, columns with
// stride blockDim.x. A_internal_size1, size1 and size2 are not read.
template <typename T>
__global__ void scaled_rank1_update_row_kernel(
          T * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          T val,
          unsigned int options2,

          const T * vec1,
          unsigned int start1,
          unsigned int inc1,
          unsigned int size1,

          const T * vec2,
          unsigned int start2,
          unsigned int inc2,
          unsigned int size2)
{
  T alpha = val;
  if (options2 & (1 << 0))
    alpha = -alpha;
  if (options2 & (1 << 1))
    alpha = ((T)(1)) / alpha;

  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
  {
    // Row-invariant factor, hoisted out of the column loop.
    T tmp = alpha * vec1[row * inc1 + start1];
    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] += tmp * vec2[col * inc2 + start2];
  }
}
1371 
1372 
1373  // alpha on GPU
// Scaled rank-1 update A += alpha * vec1 * trans(vec2) on a row-major matrix,
// with the scalar read through a pointer (alpha on GPU).
//
// options2 bit 0: negate the scalar; bit 1: use its reciprocal (applied after
// the optional negation). Rows are walked with stride gridDim.x, columns with
// stride blockDim.x. A_internal_size1, size1 and size2 are not read.
// NOTE(review): name restored as an overload of the by-value variant - confirm
// against the host-side launcher.
template <typename T>
__global__ void scaled_rank1_update_row_kernel(
          T * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const T * val,
          unsigned int options2,

          const T * vec1,
          unsigned int start1,
          unsigned int inc1,
          unsigned int size1,

          const T * vec2,
          unsigned int start2,
          unsigned int inc2,
          unsigned int size2)
{
  T alpha = *val;
  if (options2 & (1 << 0))
    alpha = -alpha;
  if (options2 & (1 << 1))
    alpha = ((T)(1)) / alpha;

  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
  {
    // Row-invariant factor, hoisted out of the column loop.
    T tmp = alpha * vec1[row * inc1 + start1];
    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] += tmp * vec2[col * inc2 + start2];
  }
}
1411 
1412 
1413 
1414  } // namespace cuda
1415  } //namespace linalg
1416 } //namespace viennacl
1417 
1418 
1419 #endif
__global__ void trans_vec_mul_row_kernel(const T *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const T *v, unsigned int v_start, unsigned int v_inc, unsigned int v_size, T *result, unsigned int result_start, unsigned int result_inc, unsigned int result_size)
Definition: matrix_operations_row.hpp:1293
__global__ void matrix_row_element_fabs_kernel(T *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const T *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
Definition: matrix_operations_row.hpp:1037
__global__ void matrix_row_element_exp_kernel(T *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const T *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
Definition: matrix_operations_row.hpp:1014
__global__ void matrix_row_element_cosh_kernel(T *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const T *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
Definition: matrix_operations_row.hpp:991
__global__ void matrix_row_element_sinh_kernel(T *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const T *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
Definition: matrix_operations_row.hpp:1152
__global__ void matrix_row_diagonal_assign_kernel(T *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, T alpha)
Definition: matrix_operations_row.hpp:739
__global__ void am_row_kernel(T *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, T fac2, unsigned int options2, const T *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
Definition: matrix_operations_row.hpp:38
vcl_size_t size1(MatrixType const &mat)
Generic routine for obtaining the number of rows of a matrix (ViennaCL, uBLAS, etc.)
Definition: size.hpp:216
__global__ void matrix_row_element_acos_kernel(T *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const T *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
Definition: matrix_operations_row.hpp:876
__global__ void element_op_int_row_kernel(T *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const T *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2, const T *C, unsigned int C_start1, unsigned int C_start2, unsigned int C_inc1, unsigned int C_inc2, unsigned int C_internal_size1, unsigned int C_internal_size2, unsigned int op_type)
Definition: matrix_operations_row.hpp:807
__global__ void matrix_row_element_ceil_kernel(T *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const T *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
Definition: matrix_operations_row.hpp:945
__global__ void vec_mul_row_kernel(const T *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const T *v, unsigned int v_start, unsigned int v_inc, unsigned int v_size, T *result, unsigned int result_start, unsigned int result_inc, unsigned int result_size)
Definition: matrix_operations_row.hpp:1248
result_of::size_type< viennacl::vector_base< T > >::type stride(viennacl::vector_base< T > const &s)
Definition: stride.hpp:46
__global__ void matrix_row_element_sin_kernel(T *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const T *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
Definition: matrix_operations_row.hpp:1129
result_of::size_type< T >::type start1(T const &obj)
Definition: start.hpp:64
result_of::size_type< MatrixType >::type size2(MatrixType const &mat)
Generic routine for obtaining the number of columns of a matrix (ViennaCL, uBLAS, etc.)
Definition: size.hpp:245
__global__ void matrix_row_element_atan_kernel(T *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const T *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
Definition: matrix_operations_row.hpp:922
__global__ void matrix_row_assign_kernel(T *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, T alpha)
Definition: matrix_operations_row.hpp:721
Main namespace in ViennaCL. Holds all the basic types such as vector, matrix, etc. and defines operations upon them.
Definition: cpu_ram.hpp:29
__global__ void matrix_row_element_abs_kernel(T *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const T *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
Definition: matrix_operations_row.hpp:853
__global__ void matrix_row_element_tanh_kernel(T *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const T *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
Definition: matrix_operations_row.hpp:1221
__global__ void ambm_m_row_kernel(T *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, T fac2, unsigned int options2, const T *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2, T fac3, unsigned int options3, const T *C, unsigned int C_start1, unsigned int C_start2, unsigned int C_inc1, unsigned int C_inc2, unsigned int C_internal_size1, unsigned int C_internal_size2)
Definition: matrix_operations_row.hpp:420
result_of::size_type< T >::type start2(T const &obj)
Definition: start.hpp:83
__global__ void matrix_row_element_floor_kernel(T *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const T *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
Definition: matrix_operations_row.hpp:1060
vector_expression< const matrix_base< NumericT, F >, const unsigned int, op_row > row(const matrix_base< NumericT, F > &A, unsigned int i)
Definition: matrix.hpp:910
__global__ void matrix_row_element_sqrt_kernel(T *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const T *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
Definition: matrix_operations_row.hpp:1175
__global__ void matrix_row_element_log10_kernel(T *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const T *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
Definition: matrix_operations_row.hpp:1106
void dot_prod(const MatrixType &A, unsigned int beg_ind, ScalarType &res)
Dot product of a particular column of matrix A with itself, starting at a certain index beg_ind...
Definition: qr.hpp:154
__global__ void matrix_row_element_tan_kernel(T *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const T *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
Definition: matrix_operations_row.hpp:1198
__global__ void matrix_row_element_cos_kernel(T *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const T *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
Definition: matrix_operations_row.hpp:968
__global__ void ambm_row_kernel(T *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, T fac2, unsigned int options2, const T *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2, T fac3, unsigned int options3, const T *C, unsigned int C_start1, unsigned int C_start2, unsigned int C_inc1, unsigned int C_inc2, unsigned int C_internal_size1, unsigned int C_internal_size2)
Definition: matrix_operations_row.hpp:117
__global__ void scaled_rank1_update_row_kernel(T *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, T val, unsigned int options2, const T *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, const T *vec2, unsigned int start2, unsigned int inc2, unsigned int size2)
Definition: matrix_operations_row.hpp:1335
__global__ void matrix_row_element_log_kernel(T *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const T *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
Definition: matrix_operations_row.hpp:1083
__global__ void element_op_row_kernel(T *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const T *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2, const T *C, unsigned int C_start1, unsigned int C_start2, unsigned int C_inc1, unsigned int C_inc2, unsigned int C_internal_size1, unsigned int C_internal_size2, unsigned int op_type)
Definition: matrix_operations_row.hpp:758
__global__ void matrix_row_element_asin_kernel(T *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const T *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
Definition: matrix_operations_row.hpp:899