ViennaCL - The Vienna Computing Library  1.5.2
direct_solve.hpp
Go to the documentation of this file.
1 #ifndef VIENNACL_LINALG_CUDA_DIRECT_SOLVE_HPP
2 #define VIENNACL_LINALG_CUDA_DIRECT_SOLVE_HPP
3 
4 /* =========================================================================
5  Copyright (c) 2010-2014, Institute for Microelectronics,
6  Institute for Analysis and Scientific Computing,
7  TU Wien.
8  Portions of this software are copyright by UChicago Argonne, LLC.
9 
10  -----------------
11  ViennaCL - The Vienna Computing Library
12  -----------------
13 
14  Project Head: Karl Rupp rupp@iue.tuwien.ac.at
15 
16  (A list of authors and contributors can be found in the PDF manual)
17 
18  License: MIT (X11), see file LICENSE in the base directory
19 ============================================================================= */
20 
25 #include "viennacl/forwards.h"
26 #include "viennacl/vector.hpp"
27 #include "viennacl/matrix.hpp"
28 
29 
31 
32 
33 namespace viennacl
34 {
35  namespace linalg
36  {
37  namespace cuda
38  {
39 
40  template <typename T>
42  const T * A,
43  unsigned int A_start1, unsigned int A_start2,
44  unsigned int A_inc1, unsigned int A_inc2,
45  unsigned int A_size1, unsigned int A_size2,
46  unsigned int A_internal_size1, unsigned int A_internal_size2,
47  bool row_major_A,
48  bool transpose_A,
49 
50  T * B,
51  unsigned int B_start1, unsigned int B_start2,
52  unsigned int B_inc1, unsigned int B_inc2,
53  unsigned int B_size1, unsigned int B_size2,
54  unsigned int B_internal_size1, unsigned int B_internal_size2,
55  bool row_major_B,
56  bool transpose_B,
57 
58  bool unit_diagonal)
59  {
60  T temp;
61  T entry_A;
62 
63  for (unsigned int row_cnt = 0; row_cnt < A_size1; ++row_cnt)
64  {
65  unsigned int row = A_size1 - 1 - row_cnt;
66 
67  if (!unit_diagonal)
68  {
69  __syncthreads();
70 
71  if (threadIdx.x == 0)
72  {
73  if (row_major_B && transpose_B)
74  B[(blockIdx.x * B_inc1 + B_start1) * B_internal_size2 + (row * B_inc2 + B_start2)] /= (row_major_A) ? A[(row * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)]
75  : A[(row * A_inc1 + A_start1) + (row * A_inc2 + A_start2)*A_internal_size1];
76  else if (row_major_B && !transpose_B)
77  B[(row * B_inc1 + B_start1) * B_internal_size2 + (blockIdx.x * B_inc2 + B_start2)] /= (row_major_A) ? A[(row * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)]
78  : A[(row * A_inc1 + A_start1) + (row * A_inc2 + A_start2)*A_internal_size1];
79  else if (!row_major_B && transpose_B)
80  B[(blockIdx.x * B_inc1 + B_start1) + (row * B_inc2 + B_start2) * B_internal_size1] /= (row_major_A) ? A[(row * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)]
81  : A[(row * A_inc1 + A_start1) + (row * A_inc2 + A_start2)*A_internal_size1];
82  else //if (!row_major_B && !transpose_B)
83  B[(row * B_inc1 + B_start1) + (blockIdx.x * B_inc2 + B_start2) * B_internal_size1] /= (row_major_A) ? A[(row * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)]
84  : A[(row * A_inc1 + A_start1) + (row * A_inc2 + A_start2)*A_internal_size1];
85  }
86  }
87 
88  __syncthreads();
89 
90  if (row_major_B && transpose_B)
91  temp = B[(blockIdx.x * B_inc1 + B_start1) * B_internal_size2 + (row * B_inc2 + B_start2)];
92  else if (row_major_B && !transpose_B)
93  temp = B[(row * B_inc1 + B_start1) * B_internal_size2 + (blockIdx.x * B_inc2 + B_start2)];
94  else if (!row_major_B && transpose_B)
95  temp = B[(blockIdx.x * B_inc1 + B_start1) + (row * B_inc2 + B_start2) * B_internal_size1];
96  else //if (!row_major_B && !transpose_B)
97  temp = B[(row * B_inc1 + B_start1) + (blockIdx.x * B_inc2 + B_start2) * B_internal_size1];
98 
99  //eliminate column of op(A) with index 'row' in parallel: " << std::endl;
100  for (unsigned int elim = threadIdx.x; elim < row; elim += blockDim.x)
101  {
102  if (row_major_A && transpose_A)
103  entry_A = A[(row * A_inc1 + A_start1) * A_internal_size2 + (elim * A_inc2 + A_start2)];
104  else if (row_major_A && !transpose_A)
105  entry_A = A[(elim * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)];
106  else if (!row_major_A && transpose_A)
107  entry_A = A[(row * A_inc1 + A_start1) + (elim * A_inc2 + A_start2) * A_internal_size1];
108  else //if (!row_major_A && !transpose_A)
109  entry_A = A[(elim * A_inc1 + A_start1) + (row * A_inc2 + A_start2) * A_internal_size1];
110 
111  if (row_major_B && transpose_B)
112  B[(blockIdx.x * B_inc1 + B_start1) * B_internal_size2 + (elim * B_inc2 + B_start2)] -= temp * entry_A;
113  else if (row_major_B && !transpose_B)
114  B[(elim * B_inc1 + B_start1) * B_internal_size2 + (blockIdx.x * B_inc2 + B_start2)] -= temp * entry_A;
115  else if (!row_major_B && transpose_B)
116  B[(blockIdx.x * B_inc1 + B_start1) + (elim * B_inc2 + B_start2) * B_internal_size1] -= temp * entry_A;
117  else //if (!row_major_B && !transpose_B)
118  B[(elim * B_inc1 + B_start1) + (blockIdx.x * B_inc2 + B_start2) * B_internal_size1] -= temp * entry_A;
119 
120  }
121  }
122  }
123 
124 
125 
126  template <typename T>
128  const T * A,
129  unsigned int A_start1, unsigned int A_start2,
130  unsigned int A_inc1, unsigned int A_inc2,
131  unsigned int A_size1, unsigned int A_size2,
132  unsigned int A_internal_size1, unsigned int A_internal_size2,
133  bool row_major_A,
134  bool transpose_A,
135 
136  T * B,
137  unsigned int B_start1, unsigned int B_start2,
138  unsigned int B_inc1, unsigned int B_inc2,
139  unsigned int B_size1, unsigned int B_size2,
140  unsigned int B_internal_size1, unsigned int B_internal_size2,
141  bool row_major_B,
142  bool transpose_B,
143 
144  bool unit_diagonal)
145  {
146  T temp;
147  T entry_A;
148 
149  for (unsigned int row = 0; row < A_size1; ++row)
150  {
151 
152  if (!unit_diagonal)
153  {
154  __syncthreads();
155 
156  if (threadIdx.x == 0)
157  {
158  if (row_major_B && transpose_B)
159  B[(blockIdx.x * B_inc1 + B_start1) * B_internal_size2 + (row * B_inc2 + B_start2)] /= (row_major_A) ? A[(row * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)]
160  : A[(row * A_inc1 + A_start1) + (row * A_inc2 + A_start2)*A_internal_size1];
161  else if (row_major_B && !transpose_B)
162  B[(row * B_inc1 + B_start1) * B_internal_size2 + (blockIdx.x * B_inc2 + B_start2)] /= (row_major_A) ? A[(row * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)]
163  : A[(row * A_inc1 + A_start1) + (row * A_inc2 + A_start2)*A_internal_size1];
164  else if (!row_major_B && transpose_B)
165  B[(blockIdx.x * B_inc1 + B_start1) + (row * B_inc2 + B_start2) * B_internal_size1] /= (row_major_A) ? A[(row * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)]
166  : A[(row * A_inc1 + A_start1) + (row * A_inc2 + A_start2)*A_internal_size1];
167  else //if (!row_major_B && !transpose_B)
168  B[(row * B_inc1 + B_start1) + (blockIdx.x * B_inc2 + B_start2) * B_internal_size1] /= (row_major_A) ? A[(row * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)]
169  : A[(row * A_inc1 + A_start1) + (row * A_inc2 + A_start2)*A_internal_size1];
170  }
171  }
172 
173  __syncthreads();
174 
175  if (row_major_B && transpose_B)
176  temp = B[(blockIdx.x * B_inc1 + B_start1) * B_internal_size2 + (row * B_inc2 + B_start2)];
177  else if (row_major_B && !transpose_B)
178  temp = B[(row * B_inc1 + B_start1) * B_internal_size2 + (blockIdx.x * B_inc2 + B_start2)];
179  else if (!row_major_B && transpose_B)
180  temp = B[(blockIdx.x * B_inc1 + B_start1) + (row * B_inc2 + B_start2) * B_internal_size1];
181  else //if (!row_major_B && !transpose_B)
182  temp = B[(row * B_inc1 + B_start1) + (blockIdx.x * B_inc2 + B_start2) * B_internal_size1];
183 
184  //eliminate column of op(A) with index 'row' in parallel: " << std::endl;
185  for (unsigned int elim = row + threadIdx.x + 1; elim < A_size1; elim += blockDim.x)
186  {
187  if (row_major_A && transpose_A)
188  entry_A = A[(row * A_inc1 + A_start1) * A_internal_size2 + (elim * A_inc2 + A_start2)];
189  else if (row_major_A && !transpose_A)
190  entry_A = A[(elim * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)];
191  else if (!row_major_A && transpose_A)
192  entry_A = A[(row * A_inc1 + A_start1) + (elim * A_inc2 + A_start2) * A_internal_size1];
193  else //if (!row_major_A && !transpose_A)
194  entry_A = A[(elim * A_inc1 + A_start1) + (row * A_inc2 + A_start2) * A_internal_size1];
195 
196  if (row_major_B && transpose_B)
197  B[(blockIdx.x * B_inc1 + B_start1) * B_internal_size2 + (elim * B_inc2 + B_start2)] -= temp * entry_A;
198  else if (row_major_B && !transpose_B)
199  B[(elim * B_inc1 + B_start1) * B_internal_size2 + (blockIdx.x * B_inc2 + B_start2)] -= temp * entry_A;
200  else if (!row_major_B && transpose_B)
201  B[(blockIdx.x * B_inc1 + B_start1) + (elim * B_inc2 + B_start2) * B_internal_size1] -= temp * entry_A;
202  else //if (!row_major_B && !transpose_B)
203  B[(elim * B_inc1 + B_start1) + (blockIdx.x * B_inc2 + B_start2) * B_internal_size1] -= temp * entry_A;
204 
205  }
206  }
207  }
208 
209 
210 
211 
212 
213 
214  namespace detail
215  {
216  template <typename T>
217  bool is_unit_solve(T const & tag) { return false; }
218 
219  inline bool is_unit_solve(viennacl::linalg::unit_lower_tag) { return true; }
220  inline bool is_unit_solve(viennacl::linalg::unit_upper_tag) { return true; }
221 
222  template <typename T>
223  bool is_upper_solve(T const & tag) { return false; }
224 
225  inline bool is_upper_solve(viennacl::linalg::upper_tag) { return true; }
226  inline bool is_upper_solve(viennacl::linalg::unit_upper_tag) { return true; }
227 
228  template <typename M1, typename M2, typename SolverTag>
229  void inplace_solve_impl(M1 const & A, bool transpose_A,
230  M2 & B, bool transpose_B,
231  SolverTag const & tag)
232  {
233  typedef typename viennacl::result_of::cpu_value_type<M1>::type value_type;
234 
235  dim3 threads(128);
236  dim3 grid( transpose_B ? B.size1() : B.size2() );
237 
238  if (is_upper_solve(tag))
239  {
240  matrix_matrix_upper_solve_kernel<<<grid,threads>>>(detail::cuda_arg<value_type>(A),
241  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
242  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
243  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
244  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
246  transpose_A,
247 
248  detail::cuda_arg<value_type>(B),
249  static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)),
250  static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)),
251  static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)),
252  static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
254  transpose_B,
255 
256  is_unit_solve(tag)
257  );
258  }
259  else
260  {
261  matrix_matrix_lower_solve_kernel<<<grid,threads>>>(detail::cuda_arg<value_type>(A),
262  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
263  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
264  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
265  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
267  transpose_A,
268 
269  detail::cuda_arg<value_type>(B),
270  static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)),
271  static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)),
272  static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)),
273  static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
275  transpose_B,
276 
277  is_unit_solve(tag)
278  );
279  }
280 
281  }
282  }
283 
284 
285  //
286  // Note: By convention, all size checks are performed in the calling frontend. No need to double-check here.
287  //
288 
290 
296  template <typename NumericT, typename F1, typename F2, typename SOLVERTAG>
298  {
300  B, false, tag);
301  }
302 
309  template <typename NumericT, typename F1, typename F2, typename SOLVERTAG>
312  SOLVERTAG tag)
313  {
315  const_cast<matrix_base<NumericT, F2> &>(proxy_B.lhs()), true, tag);
316  }
317 
318  //upper triangular solver for transposed lower triangular matrices
325  template <typename NumericT, typename F1, typename F2, typename SOLVERTAG>
328  SOLVERTAG tag)
329  {
330  detail::inplace_solve_impl(const_cast<matrix_base<NumericT, F1> &>(proxy_A.lhs()), true,
331  B, false, tag);
332  }
333 
340  template <typename NumericT, typename F1, typename F2, typename SOLVERTAG>
343  SOLVERTAG tag)
344  {
345  detail::inplace_solve_impl(const_cast<matrix_base<NumericT, F1> &>(proxy_A.lhs()), true,
346  const_cast<matrix_base<NumericT, F2> &>(proxy_B.lhs()), true, tag);
347  }
348 
349 
350 
351  //
352  // Solve on vector
353  //
354 
355  template <typename T>
357  T const * A,
358  unsigned int A_start1, unsigned int A_start2,
359  unsigned int A_inc1, unsigned int A_inc2,
360  unsigned int A_size1, unsigned int A_size2,
361  unsigned int A_internal_size1, unsigned int A_internal_size2,
362  T * v,
363  unsigned int v_start,
364  unsigned int v_inc,
365  unsigned int v_size,
366 
367  unsigned int options)
368  {
369  T temp;
370  unsigned int unit_diagonal_flag = (options & (1 << 0));
371  unsigned int transposed_access_A = (options & (1 << 1));
372  unsigned int is_lower_solve = (options & (1 << 2));
373  unsigned int row;
374  for (unsigned int rows_processed = 0; rows_processed < A_size1; ++rows_processed) //Note: A required to be square
375  {
376  row = is_lower_solve ? rows_processed : ((A_size1 - rows_processed) - 1);
377  if (!unit_diagonal_flag)
378  {
379  __syncthreads();
380  if (threadIdx.x == 0)
381  v[row * v_inc + v_start] /= A[(row * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)];
382  }
383 
384  __syncthreads();
385 
386  temp = v[row * v_inc + v_start];
387 
388  for (int elim = (is_lower_solve ? (row + threadIdx.x + 1) : threadIdx.x);
389  elim < (is_lower_solve ? A_size1 : row);
390  elim += blockDim.x)
391  v[elim * v_inc + v_start] -= temp * A[transposed_access_A ? ((row * A_inc1 + A_start1) * A_internal_size2 + (elim * A_inc2 + A_start2))
392  : ((elim * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2))];
393  }
394  }
395 
396 
397  template <typename T>
399  T const * A,
400  unsigned int A_start1, unsigned int A_start2,
401  unsigned int A_inc1, unsigned int A_inc2,
402  unsigned int A_size1, unsigned int A_size2,
403  unsigned int A_internal_size1, unsigned int A_internal_size2,
404  T * v,
405  unsigned int v_start,
406  unsigned int v_inc,
407  unsigned int v_size,
408  unsigned int options)
409  {
410  T temp;
411  unsigned int unit_diagonal_flag = (options & (1 << 0));
412  unsigned int transposed_access_A = (options & (1 << 1));
413  unsigned int is_lower_solve = (options & (1 << 2));
414  unsigned int row;
415  for (unsigned int rows_processed = 0; rows_processed < A_size1; ++rows_processed) //Note: A required to be square
416  {
417  row = is_lower_solve ? rows_processed : ((A_size1 - rows_processed) - 1);
418  if (!unit_diagonal_flag)
419  {
420  __syncthreads();
421  if (threadIdx.x == 0)
422  v[row * v_inc + v_start] /= A[(row * A_inc1 + A_start1) + (row * A_inc2 + A_start2) * A_internal_size1];
423  }
424 
425  __syncthreads();
426 
427  temp = v[row * v_inc + v_start];
428 
429  for (int elim = (is_lower_solve ? (row + threadIdx.x + 1) : threadIdx.x);
430  elim < (is_lower_solve ? A_size1 : row);
431  elim += blockDim.x)
432  v[elim * v_inc + v_start] -= temp * A[transposed_access_A ? ((row * A_inc1 + A_start1) + (elim * A_inc2 + A_start2) * A_internal_size1)
433  : ((elim * A_inc1 + A_start1) + (row * A_inc2 + A_start2) * A_internal_size1)];
434  }
435  }
436 
437 
438  namespace detail
439  {
440  inline unsigned int get_option_for_solver_tag(viennacl::linalg::upper_tag) { return 0; }
441  inline unsigned int get_option_for_solver_tag(viennacl::linalg::unit_upper_tag) { return (1 << 0); }
442  inline unsigned int get_option_for_solver_tag(viennacl::linalg::lower_tag) { return (1 << 2); }
443  inline unsigned int get_option_for_solver_tag(viennacl::linalg::unit_lower_tag) { return (1 << 2) | (1 << 0); }
444 
445  template <typename MatrixType, typename VectorType>
446  void inplace_solve_vector_impl(MatrixType const & mat,
447  VectorType & vec,
448  unsigned int options)
449  {
450  typedef typename viennacl::result_of::cpu_value_type<MatrixType>::type value_type;
451 
453  {
454  triangular_substitute_inplace_row_kernel<<<1, 128>>>(detail::cuda_arg<value_type>(mat),
455  static_cast<unsigned int>(viennacl::traits::start1(mat)), static_cast<unsigned int>(viennacl::traits::start2(mat)),
456  static_cast<unsigned int>(viennacl::traits::stride1(mat)), static_cast<unsigned int>(viennacl::traits::stride2(mat)),
457  static_cast<unsigned int>(viennacl::traits::size1(mat)), static_cast<unsigned int>(viennacl::traits::size2(mat)),
458  static_cast<unsigned int>(viennacl::traits::internal_size1(mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat)),
459  detail::cuda_arg<value_type>(vec),
460  static_cast<unsigned int>(viennacl::traits::start(vec)),
461  static_cast<unsigned int>(viennacl::traits::stride(vec)),
462  static_cast<unsigned int>(viennacl::traits::size(vec)),
463  options
464  );
465  }
466  else
467  {
468  triangular_substitute_inplace_col_kernel<<<1, 128>>>(detail::cuda_arg<value_type>(mat),
469  static_cast<unsigned int>(viennacl::traits::start1(mat)), static_cast<unsigned int>(viennacl::traits::start2(mat)),
470  static_cast<unsigned int>(viennacl::traits::stride1(mat)), static_cast<unsigned int>(viennacl::traits::stride2(mat)),
471  static_cast<unsigned int>(viennacl::traits::size1(mat)), static_cast<unsigned int>(viennacl::traits::size2(mat)),
472  static_cast<unsigned int>(viennacl::traits::internal_size1(mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat)),
473  detail::cuda_arg<value_type>(vec),
474  static_cast<unsigned int>(viennacl::traits::start(vec)),
475  static_cast<unsigned int>(viennacl::traits::stride(vec)),
476  static_cast<unsigned int>(viennacl::traits::size(vec)),
477  options
478  );
479  }
480  }
481 
482  }
483 
489  template <typename NumericT, typename F, typename SOLVERTAG>
491  vector_base<NumericT> & vec,
492  SOLVERTAG)
493  {
494  unsigned int options = detail::get_option_for_solver_tag(SOLVERTAG());
495 
496  detail::inplace_solve_vector_impl(mat, vec, options);
497  }
498 
499 
500 
501 
507  template <typename NumericT, typename F, typename SOLVERTAG>
509  vector_base<NumericT> & vec,
510  SOLVERTAG)
511  {
512  unsigned int options = detail::get_option_for_solver_tag(SOLVERTAG()) | 0x02; //add transpose-flag
513 
514  detail::inplace_solve_vector_impl(proxy.lhs(), vec, options);
515  }
516 
517 
518 
519  }
520  }
521 }
522 
523 #endif
result_of::size_type< matrix_base< NumericT, F > >::type stride2(matrix_base< NumericT, F > const &s)
Definition: stride.hpp:68
__global__ void triangular_substitute_inplace_col_kernel(T const *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, T *v, unsigned int v_start, unsigned int v_inc, unsigned int v_size, unsigned int options)
Definition: direct_solve.hpp:398
__global__ void matrix_matrix_upper_solve_kernel(const T *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, bool row_major_A, bool transpose_A, T *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_size1, unsigned int B_size2, unsigned int B_internal_size1, unsigned int B_internal_size2, bool row_major_B, bool transpose_B, bool unit_diagonal)
Definition: direct_solve.hpp:41
Common routines for CUDA execution.
Helper class for checking whether a matrix has a row-major layout.
Definition: forwards.h:399
Implementation of the dense matrix class.
vcl_size_t size1(MatrixType const &mat)
Generic routine for obtaining the number of rows of a matrix (ViennaCL, uBLAS, etc.)
Definition: size.hpp:216
A tag class representing a lower triangular matrix.
Definition: forwards.h:703
void inplace_solve(const matrix_base< NumericT, F1 > &A, matrix_base< NumericT, F2 > &B, SOLVERTAG tag)
Direct inplace solver for triangular systems with multiple right hand sides, i.e. A \ B (MATLAB notat...
Definition: direct_solve.hpp:297
result_of::size_type< viennacl::vector_base< T > >::type stride(viennacl::vector_base< T > const &s)
Definition: stride.hpp:46
Common base class for dense vectors, vector ranges, and vector slices.
Definition: forwards.h:205
This file provides the forward declarations for the main types used within ViennaCL.
result_of::size_type< T >::type start1(T const &obj)
Definition: start.hpp:64
__global__ void matrix_matrix_lower_solve_kernel(const T *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, bool row_major_A, bool transpose_A, T *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_size1, unsigned int B_size2, unsigned int B_internal_size1, unsigned int B_internal_size2, bool row_major_B, bool transpose_B, bool unit_diagonal)
Definition: direct_solve.hpp:127
result_of::size_type< MatrixType >::type size2(MatrixType const &mat)
Generic routine for obtaining the number of columns of a matrix (ViennaCL, uBLAS, etc...
Definition: size.hpp:245
A dense matrix class.
Definition: forwards.h:290
result_of::size_type< matrix_base< NumericT, F > >::type stride1(matrix_base< NumericT, F > const &s)
Definition: stride.hpp:57
Main namespace in ViennaCL. Holds all the basic types such as vector, matrix, etc. and defines operations upon them.
Definition: cpu_ram.hpp:29
vcl_size_t size(VectorType const &vec)
Generic routine for obtaining the size of a vector (ViennaCL, uBLAS, etc.)
Definition: size.hpp:144
result_of::size_type< T >::type start2(T const &obj)
Definition: start.hpp:83
bool is_unit_solve(T const &tag)
Definition: direct_solve.hpp:217
void inplace_solve_vector_impl(MatrixType const &mat, VectorType &vec, unsigned int options)
Definition: direct_solve.hpp:446
__global__ void triangular_substitute_inplace_row_kernel(T const *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, T *v, unsigned int v_start, unsigned int v_inc, unsigned int v_size, unsigned int options)
Definition: direct_solve.hpp:356
A tag class representing an upper triangular matrix.
Definition: forwards.h:708
result_of::size_type< T >::type start(T const &obj)
Definition: start.hpp:43
void inplace_solve_impl(M1 const &A, bool transpose_A, M2 &B, bool transpose_B, SolverTag const &tag)
Definition: direct_solve.hpp:229
Expression template class for representing a tree of expressions which ultimately result in a matrix...
Definition: forwards.h:283
vcl_size_t internal_size2(matrix_base< NumericT, F > const &mat)
Helper routine for obtaining the internal number of entries per column of a ViennaCL matrix...
Definition: size.hpp:287
vector_expression< const matrix_base< NumericT, F >, const unsigned int, op_row > row(const matrix_base< NumericT, F > &A, unsigned int i)
Definition: matrix.hpp:910
T::ERROR_CANNOT_DEDUCE_CPU_SCALAR_TYPE_FOR_T type
Definition: result_of.hpp:276
The vector type with operator-overloads and proxy classes is defined here. Linear algebra operations ...
A tag class representing a lower triangular matrix with unit diagonal.
Definition: forwards.h:713
A tag class representing transposed matrices.
Definition: forwards.h:165
unsigned int get_option_for_solver_tag(viennacl::linalg::upper_tag)
Definition: direct_solve.hpp:440
bool is_upper_solve(T const &tag)
Definition: direct_solve.hpp:223
vcl_size_t internal_size1(matrix_base< NumericT, F > const &mat)
Helper routine for obtaining the internal number of entries per row of a ViennaCL matrix...
Definition: size.hpp:279
A tag class representing an upper triangular matrix with unit diagonal.
Definition: forwards.h:718