Actual source code: sbaijfact.c
1: #define PETSCMAT_DLL
3: #include src/mat/impls/baij/seq/baij.h
4: #include src/mat/impls/sbaij/seq/sbaij.h
5: #include src/inline/ilu.h
6: #include include/petscis.h
8: #if !defined(PETSC_USE_COMPLEX)
9: /*
10: input:
11: F -- numeric factor
12: output:
13: nneg, nzero, npos: matrix inertia
14: */
18: PetscErrorCode MatGetInertia_SeqSBAIJ(Mat F,PetscInt *nneig,PetscInt *nzero,PetscInt *npos)
19: {
20: Mat_SeqSBAIJ *fact_ptr = (Mat_SeqSBAIJ*)F->data;
21: PetscScalar *dd = fact_ptr->a;
22: PetscInt mbs=fact_ptr->mbs,bs=F->rmap.bs,i,nneig_tmp,npos_tmp,*fi = fact_ptr->i;
25: if (bs != 1) SETERRQ1(PETSC_ERR_SUP,"No support for bs: %D >1 yet",bs);
26: nneig_tmp = 0; npos_tmp = 0;
27: for (i=0; i<mbs; i++){
28: if (PetscRealPart(dd[*fi]) > 0.0){
29: npos_tmp++;
30: } else if (PetscRealPart(dd[*fi]) < 0.0){
31: nneig_tmp++;
32: }
33: fi++;
34: }
35: if (nneig) *nneig = nneig_tmp;
36: if (npos) *npos = npos_tmp;
37: if (nzero) *nzero = mbs - nneig_tmp - npos_tmp;
39: return(0);
40: }
41: #endif /* !defined(PETSC_USE_COMPLEX) */
43: /*
44: Symbolic U^T*D*U factorization for SBAIJ format. Modified from SSF of YSMP.
45: Use Modified Sparse Row (MSR) storage for u and ju. See page 85, "Iterative Methods ..." by Saad.
46: */
49: PetscErrorCode MatCholeskyFactorSymbolic_SeqSBAIJ_MSR(Mat A,IS perm,MatFactorInfo *info,Mat *B)
50: {
51: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b;
53: PetscInt *rip,i,mbs = a->mbs,*ai,*aj;
54: PetscInt *jutmp,bs = A->rmap.bs,bs2=a->bs2;
55: PetscInt m,reallocs = 0,prow;
56: PetscInt *jl,*q,jmin,jmax,juidx,nzk,qm,*iu,*ju,k,j,vj,umax,maxadd;
57: PetscReal f = info->fill;
58: PetscTruth perm_identity;
61: /* check whether perm is the identity mapping */
62: ISIdentity(perm,&perm_identity);
63: ISGetIndices(perm,&rip);
64:
65: if (perm_identity){ /* without permutation */
66: a->permute = PETSC_FALSE;
67: ai = a->i; aj = a->j;
68: } else { /* non-trivial permutation */
69: a->permute = PETSC_TRUE;
70: MatReorderingSeqSBAIJ(A,perm);
71: ai = a->inew; aj = a->jnew;
72: }
73:
74: /* initialization */
75: PetscMalloc((mbs+1)*sizeof(PetscInt),&iu);
76: umax = (PetscInt)(f*ai[mbs] + 1); umax += mbs + 1;
77: PetscMalloc(umax*sizeof(PetscInt),&ju);
78: iu[0] = mbs+1;
79: juidx = mbs + 1; /* index for ju */
80: PetscMalloc(2*mbs*sizeof(PetscInt),&jl); /* linked list for pivot row */
81: q = jl + mbs; /* linked list for col index */
82: for (i=0; i<mbs; i++){
83: jl[i] = mbs;
84: q[i] = 0;
85: }
87: /* for each row k */
88: for (k=0; k<mbs; k++){
89: for (i=0; i<mbs; i++) q[i] = 0; /* to be removed! */
90: nzk = 0; /* num. of nz blocks in k-th block row with diagonal block excluded */
91: q[k] = mbs;
92: /* initialize nonzero structure of k-th row to row rip[k] of A */
93: jmin = ai[rip[k]] +1; /* exclude diag[k] */
94: jmax = ai[rip[k]+1];
95: for (j=jmin; j<jmax; j++){
96: vj = rip[aj[j]]; /* col. value */
97: if(vj > k){
98: qm = k;
99: do {
100: m = qm; qm = q[m];
101: } while(qm < vj);
102: if (qm == vj) {
103: SETERRQ(PETSC_ERR_PLIB,"Duplicate entry in A\n");
104: }
105: nzk++;
106: q[m] = vj;
107: q[vj] = qm;
108: } /* if(vj > k) */
109: } /* for (j=jmin; j<jmax; j++) */
111: /* modify nonzero structure of k-th row by computing fill-in
112: for each row i to be merged in */
113: prow = k;
114: prow = jl[prow]; /* next pivot row (== mbs for symbolic factorization) */
115:
116: while (prow < k){
117: /* merge row prow into k-th row */
118: jmin = iu[prow] + 1; jmax = iu[prow+1];
119: qm = k;
120: for (j=jmin; j<jmax; j++){
121: vj = ju[j];
122: do {
123: m = qm; qm = q[m];
124: } while (qm < vj);
125: if (qm != vj){
126: nzk++; q[m] = vj; q[vj] = qm; qm = vj;
127: }
128: }
129: prow = jl[prow]; /* next pivot row */
130: }
131:
132: /* add k to row list for first nonzero element in k-th row */
133: if (nzk > 0){
134: i = q[k]; /* col value of first nonzero element in U(k, k+1:mbs-1) */
135: jl[k] = jl[i]; jl[i] = k;
136: }
137: iu[k+1] = iu[k] + nzk;
139: /* allocate more space to ju if needed */
140: if (iu[k+1] > umax) {
141: /* estimate how much additional space we will need */
142: /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
143: /* just double the memory each time */
144: maxadd = umax;
145: if (maxadd < nzk) maxadd = (mbs-k)*(nzk+1)/2;
146: umax += maxadd;
148: /* allocate a longer ju */
149: PetscMalloc(umax*sizeof(PetscInt),&jutmp);
150: PetscMemcpy(jutmp,ju,iu[k]*sizeof(PetscInt));
151: PetscFree(ju);
152: ju = jutmp;
153: reallocs++; /* count how many times we realloc */
154: }
156: /* save nonzero structure of k-th row in ju */
157: i=k;
158: while (nzk --) {
159: i = q[i];
160: ju[juidx++] = i;
161: }
162: }
164: #if defined(PETSC_USE_INFO)
165: if (ai[mbs] != 0) {
166: PetscReal af = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
167: PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,f,af);
168: PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);
169: PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);
170: PetscInfo(A,"for best performance.\n");
171: } else {
172: PetscInfo(A,"Empty matrix.\n");
173: }
174: #endif
176: ISRestoreIndices(perm,&rip);
177: PetscFree(jl);
179: /* put together the new matrix */
180: MatCreate(A->comm,B);
181: MatSetSizes(*B,bs*mbs,bs*mbs,bs*mbs,bs*mbs);
182: MatSetType(*B,A->type_name);
183: MatSeqSBAIJSetPreallocation_SeqSBAIJ(*B,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);
185: /* PetscLogObjectParent(*B,iperm); */
186: b = (Mat_SeqSBAIJ*)(*B)->data;
187: b->singlemalloc = PETSC_FALSE;
188: b->free_a = PETSC_TRUE;
189: b->free_ij = PETSC_TRUE;
190: PetscMalloc((iu[mbs]+1)*sizeof(MatScalar)*bs2,&b->a);
191: b->j = ju;
192: b->i = iu;
193: b->diag = 0;
194: b->ilen = 0;
195: b->imax = 0;
196: b->row = perm;
197: b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
198: PetscObjectReference((PetscObject)perm);
199: b->icol = perm;
200: PetscObjectReference((PetscObject)perm);
201: PetscMalloc((bs*mbs+bs)*sizeof(PetscScalar),&b->solve_work);
202: /* In b structure: Free imax, ilen, old a, old j.
203: Allocate idnew, solve_work, new a, new j */
204: PetscLogObjectMemory(*B,(iu[mbs]-mbs)*(sizeof(PetscInt)+sizeof(MatScalar)));
205: b->maxnz = b->nz = iu[mbs];
206:
207: (*B)->factor = FACTOR_CHOLESKY;
208: (*B)->info.factor_mallocs = reallocs;
209: (*B)->info.fill_ratio_given = f;
210: if (ai[mbs] != 0) {
211: (*B)->info.fill_ratio_needed = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
212: } else {
213: (*B)->info.fill_ratio_needed = 0.0;
214: }
216: if (perm_identity){
217: switch (bs) {
218: case 1:
219: (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering;
220: (*B)->ops->solve = MatSolve_SeqSBAIJ_1_NaturalOrdering;
221: (*B)->ops->forwardsolve = MatForwardSolve_SeqSBAIJ_1_NaturalOrdering;
222: (*B)->ops->backwardsolve = MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering;
223: PetscInfo(A,"Using special in-place natural ordering factor and solve BS=1\n");
224: break;
225: case 2:
226: (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_2_NaturalOrdering;
227: (*B)->ops->solve = MatSolve_SeqSBAIJ_2_NaturalOrdering;
228: (*B)->ops->forwardsolve = MatForwardSolve_SeqSBAIJ_2_NaturalOrdering;
229: (*B)->ops->backwardsolve = MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering;
230: PetscInfo(A,"Using special in-place natural ordering factor and solve BS=2\n");
231: break;
232: case 3:
233: (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_3_NaturalOrdering;
234: (*B)->ops->solve = MatSolve_SeqSBAIJ_3_NaturalOrdering;
235: (*B)->ops->forwardsolve = MatForwardSolve_SeqSBAIJ_3_NaturalOrdering;
236: (*B)->ops->backwardsolve = MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering;
237: PetscInfo(A,"Using special in-place natural ordering factor and solve BS=3\n");
238: break;
239: case 4:
240: (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_4_NaturalOrdering;
241: (*B)->ops->solve = MatSolve_SeqSBAIJ_4_NaturalOrdering;
242: (*B)->ops->forwardsolve = MatForwardSolve_SeqSBAIJ_4_NaturalOrdering;
243: (*B)->ops->backwardsolve = MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering;
244: PetscInfo(A,"Using special in-place natural ordering factor and solve BS=4\n");
245: break;
246: case 5:
247: (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_5_NaturalOrdering;
248: (*B)->ops->solve = MatSolve_SeqSBAIJ_5_NaturalOrdering;
249: (*B)->ops->forwardsolve = MatForwardSolve_SeqSBAIJ_5_NaturalOrdering;
250: (*B)->ops->backwardsolve = MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering;
251: PetscInfo(A,"Using special in-place natural ordering factor and solve BS=5\n");
252: break;
253: case 6:
254: (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_6_NaturalOrdering;
255: (*B)->ops->solve = MatSolve_SeqSBAIJ_6_NaturalOrdering;
256: (*B)->ops->forwardsolve = MatForwardSolve_SeqSBAIJ_6_NaturalOrdering;
257: (*B)->ops->backwardsolve = MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering;
258: PetscInfo(A,"Using special in-place natural ordering factor and solve BS=6\n");
259: break;
260: case 7:
261: (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_7_NaturalOrdering;
262: (*B)->ops->solve = MatSolve_SeqSBAIJ_7_NaturalOrdering;
263: (*B)->ops->forwardsolve = MatForwardSolve_SeqSBAIJ_7_NaturalOrdering;
264: (*B)->ops->backwardsolve = MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering;
265: PetscInfo(A,"Using special in-place natural ordering factor and solve BS=7\n");
266: break;
267: default:
268: (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_N_NaturalOrdering;
269: (*B)->ops->solve = MatSolve_SeqSBAIJ_N_NaturalOrdering;
270: (*B)->ops->forwardsolve = MatForwardSolve_SeqSBAIJ_N_NaturalOrdering;
271: (*B)->ops->backwardsolve = MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering;
272: PetscInfo(A,"Using special in-place natural ordering factor and solve BS>7\n");
273: break;
274: }
275: }
276: return(0);
277: }
278: /*
279: Symbolic U^T*D*U factorization for SBAIJ format.
280: */
281: #include petscbt.h
282: #include src/mat/utils/freespace.h
285: PetscErrorCode MatCholeskyFactorSymbolic_SeqSBAIJ(Mat A,IS perm,MatFactorInfo *info,Mat *fact)
286: {
287: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
288: Mat_SeqSBAIJ *b;
289: Mat B;
290: PetscErrorCode ierr;
291: PetscTruth perm_identity;
292: PetscReal fill = info->fill;
293: PetscInt *rip,i,mbs=a->mbs,bs=A->rmap.bs,*ai,*aj,reallocs=0,prow;
294: PetscInt *jl,jmin,jmax,nzk,*ui,k,j,*il,nextprow;
295: PetscInt nlnk,*lnk,ncols,*cols,*uj,**ui_ptr,*uj_ptr;
296: PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;
297: PetscBT lnkbt;
300: /*
301: This code originally uses Modified Sparse Row (MSR) storage
302: (see page 85, "Iterative Methods ..." by Saad) for the output matrix B - bad choise!
303: Then it is rewritten so the factor B takes seqsbaij format. However the associated
304: MatCholeskyFactorNumeric_() have not been modified for the cases of bs>1 or !perm_identity,
305: thus the original code in MSR format is still used for these cases.
306: The code below should replace MatCholeskyFactorSymbolic_SeqSBAIJ_MSR() whenever
307: MatCholeskyFactorNumeric_() is modified for using sbaij symbolic factor.
308: */
309: if (bs > 1){
310: MatCholeskyFactorSymbolic_SeqSBAIJ_MSR(A,perm,info,fact);
311: return(0);
312: }
314: /* check whether perm is the identity mapping */
315: ISIdentity(perm,&perm_identity);
317: if (perm_identity){
318: a->permute = PETSC_FALSE;
319: ai = a->i; aj = a->j;
320: } else {
321: SETERRQ(PETSC_ERR_SUP,"Matrix reordering is not supported for sbaij matrix. Use aij format");
322: /* There are bugs for reordeing. Needs further work.
323: MatReordering for sbaij cannot be efficient. User should use aij formt! */
324: a->permute = PETSC_TRUE;
325: MatReorderingSeqSBAIJ(A,perm);
326: ai = a->inew; aj = a->jnew;
327: }
328: ISGetIndices(perm,&rip);
330: /* initialization */
331: PetscMalloc((mbs+1)*sizeof(PetscInt),&ui);
332: ui[0] = 0;
334: /* jl: linked list for storing indices of the pivot rows
335: il: il[i] points to the 1st nonzero entry of U(i,k:mbs-1) */
336: PetscMalloc((3*mbs+1)*sizeof(PetscInt)+mbs*sizeof(PetscInt*),&jl);
337: il = jl + mbs;
338: cols = il + mbs;
339: ui_ptr = (PetscInt**)(cols + mbs);
340:
341: for (i=0; i<mbs; i++){
342: jl[i] = mbs; il[i] = 0;
343: }
345: /* create and initialize a linked list for storing column indices of the active row k */
346: nlnk = mbs + 1;
347: PetscLLCreate(mbs,mbs,nlnk,lnk,lnkbt);
349: /* initial FreeSpace size is fill*(ai[mbs]+1) */
350: PetscFreeSpaceGet((PetscInt)(fill*(ai[mbs]+1)),&free_space);
351: current_space = free_space;
353: for (k=0; k<mbs; k++){ /* for each active row k */
354: /* initialize lnk by the column indices of row rip[k] of A */
355: nzk = 0;
356: ncols = ai[rip[k]+1] - ai[rip[k]];
357: for (j=0; j<ncols; j++){
358: i = *(aj + ai[rip[k]] + j);
359: cols[j] = rip[i];
360: }
361: PetscLLAdd(ncols,cols,mbs,nlnk,lnk,lnkbt);
362: nzk += nlnk;
364: /* update lnk by computing fill-in for each pivot row to be merged in */
365: prow = jl[k]; /* 1st pivot row */
366:
367: while (prow < k){
368: nextprow = jl[prow];
369: /* merge prow into k-th row */
370: jmin = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:mbs-1) */
371: jmax = ui[prow+1];
372: ncols = jmax-jmin;
373: uj_ptr = ui_ptr[prow] + jmin - ui[prow]; /* points to the 2nd nzero entry in U(prow,k:mbs-1) */
374: PetscLLAddSorted(ncols,uj_ptr,mbs,nlnk,lnk,lnkbt);
375: nzk += nlnk;
377: /* update il and jl for prow */
378: if (jmin < jmax){
379: il[prow] = jmin;
380: j = *uj_ptr; jl[prow] = jl[j]; jl[j] = prow;
381: }
382: prow = nextprow;
383: }
385: /* if free space is not available, make more free space */
386: if (current_space->local_remaining<nzk) {
387: i = mbs - k + 1; /* num of unfactored rows */
388: i = PetscMin(i*nzk, i*(i-1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
389: PetscFreeSpaceGet(i,¤t_space);
390: reallocs++;
391: }
393: /* copy data into free space, then initialize lnk */
394: PetscLLClean(mbs,mbs,nzk,lnk,current_space->array,lnkbt);
396: /* add the k-th row into il and jl */
397: if (nzk-1 > 0){
398: i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:mbs-1) */
399: jl[k] = jl[i]; jl[i] = k;
400: il[k] = ui[k] + 1;
401: }
402: ui_ptr[k] = current_space->array;
403: current_space->array += nzk;
404: current_space->local_used += nzk;
405: current_space->local_remaining -= nzk;
407: ui[k+1] = ui[k] + nzk;
408: }
410: #if defined(PETSC_USE_INFO)
411: if (ai[mbs] != 0) {
412: PetscReal af = ((PetscReal)ui[mbs])/((PetscReal)ai[mbs]);
413: PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,fill,af);
414: PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);
415: PetscInfo1(A,"PCFactorSetFill(pc,%G) for best performance.\n",af);
416: } else {
417: PetscInfo(A,"Empty matrix.\n");
418: }
419: #endif
421: ISRestoreIndices(perm,&rip);
422: PetscFree(jl);
424: /* destroy list of free space and other temporary array(s) */
425: PetscMalloc((ui[mbs]+1)*sizeof(PetscInt),&uj);
426: PetscFreeSpaceContiguous(&free_space,uj);
427: PetscLLDestroy(lnk,lnkbt);
429: /* put together the new matrix in MATSEQSBAIJ format */
430: MatCreate(PETSC_COMM_SELF,fact);
431: MatSetSizes(*fact,mbs,mbs,mbs,mbs);
432: B = *fact;
433: MatSetType(B,MATSEQSBAIJ);
434: MatSeqSBAIJSetPreallocation_SeqSBAIJ(B,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);
436: b = (Mat_SeqSBAIJ*)B->data;
437: b->singlemalloc = PETSC_FALSE;
438: b->free_a = PETSC_TRUE;
439: b->free_ij = PETSC_TRUE;
440: PetscMalloc((ui[mbs]+1)*sizeof(MatScalar),&b->a);
441: b->j = uj;
442: b->i = ui;
443: b->diag = 0;
444: b->ilen = 0;
445: b->imax = 0;
446: b->row = perm;
447: b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
448: PetscObjectReference((PetscObject)perm);
449: b->icol = perm;
450: PetscObjectReference((PetscObject)perm);
451: PetscMalloc((mbs+1)*sizeof(PetscScalar),&b->solve_work);
452: PetscLogObjectMemory(B,(ui[mbs]-mbs)*(sizeof(PetscInt)+sizeof(MatScalar)));
453: b->maxnz = b->nz = ui[mbs];
454:
455: B->factor = FACTOR_CHOLESKY;
456: B->info.factor_mallocs = reallocs;
457: B->info.fill_ratio_given = fill;
458: if (ai[mbs] != 0) {
459: B->info.fill_ratio_needed = ((PetscReal)ui[mbs])/((PetscReal)ai[mbs]);
460: } else {
461: B->info.fill_ratio_needed = 0.0;
462: }
464: if (perm_identity){
465: switch (bs) {
466: case 1:
467: B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering;
468: B->ops->solve = MatSolve_SeqSBAIJ_1_NaturalOrdering;
469: B->ops->forwardsolve = MatForwardSolve_SeqSBAIJ_1_NaturalOrdering;
470: B->ops->backwardsolve = MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering;
471: PetscInfo(A,"Using special in-place natural ordering factor and solve BS=1\n");
472: break;
473: case 2:
474: B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_2_NaturalOrdering;
475: B->ops->solve = MatSolve_SeqSBAIJ_2_NaturalOrdering;
476: B->ops->forwardsolve = MatForwardSolve_SeqSBAIJ_2_NaturalOrdering;
477: B->ops->backwardsolve = MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering;
478: PetscInfo(A,"Using special in-place natural ordering factor and solve BS=2\n");
479: break;
480: case 3:
481: B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_3_NaturalOrdering;
482: B->ops->solve = MatSolve_SeqSBAIJ_3_NaturalOrdering;
483: B->ops->forwardsolve = MatForwardSolve_SeqSBAIJ_3_NaturalOrdering;
484: B->ops->backwardsolve = MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering;
485: PetscInfo(A,"Using special in-place natural ordering factor and solve BS=3\n");
486: break;
487: case 4:
488: B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_4_NaturalOrdering;
489: B->ops->solve = MatSolve_SeqSBAIJ_4_NaturalOrdering;
490: B->ops->forwardsolve = MatForwardSolve_SeqSBAIJ_4_NaturalOrdering;
491: B->ops->backwardsolve = MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering;
492: PetscInfo(A,"Using special in-place natural ordering factor and solve BS=4\n");
493: break;
494: case 5:
495: B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_5_NaturalOrdering;
496: B->ops->solve = MatSolve_SeqSBAIJ_5_NaturalOrdering;
497: B->ops->forwardsolve = MatForwardSolve_SeqSBAIJ_5_NaturalOrdering;
498: B->ops->backwardsolve = MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering;
499: PetscInfo(A,"Using special in-place natural ordering factor and solve BS=5\n");
500: break;
501: case 6:
502: B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_6_NaturalOrdering;
503: B->ops->solve = MatSolve_SeqSBAIJ_6_NaturalOrdering;
504: B->ops->forwardsolve = MatForwardSolve_SeqSBAIJ_6_NaturalOrdering;
505: B->ops->backwardsolve = MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering;
506: PetscInfo(A,"Using special in-place natural ordering factor and solve BS=6\n");
507: break;
508: case 7:
509: B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_7_NaturalOrdering;
510: B->ops->solve = MatSolve_SeqSBAIJ_7_NaturalOrdering;
511: B->ops->forwardsolve = MatForwardSolve_SeqSBAIJ_7_NaturalOrdering;
512: B->ops->backwardsolve = MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering;
513: PetscInfo(A,"Using special in-place natural ordering factor and solve BS=7\n");
514: break;
515: default:
516: B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_N_NaturalOrdering;
517: B->ops->solve = MatSolve_SeqSBAIJ_N_NaturalOrdering;
518: B->ops->forwardsolve = MatForwardSolve_SeqSBAIJ_N_NaturalOrdering;
519: B->ops->backwardsolve = MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering;
520: PetscInfo(A,"Using special in-place natural ordering factor and solve BS>7\n");
521: break;
522: }
523: }
524: return(0);
525: }
528: PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_N(Mat A,MatFactorInfo *info,Mat *B)
529: {
530: Mat C = *B;
531: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
532: IS perm = b->row;
534: PetscInt *perm_ptr,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
535: PetscInt *ai,*aj,*a2anew,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
536: PetscInt bs=A->rmap.bs,bs2 = a->bs2,bslog = 0;
537: MatScalar *ba = b->a,*aa,*ap,*dk,*uik;
538: MatScalar *u,*diag,*rtmp,*rtmp_ptr;
539: MatScalar *work;
540: PetscInt *pivots;
543: /* initialization */
544: PetscMalloc(bs2*mbs*sizeof(MatScalar),&rtmp);
545: PetscMemzero(rtmp,bs2*mbs*sizeof(MatScalar));
546: PetscMalloc(2*mbs*sizeof(PetscInt),&il);
547: jl = il + mbs;
548: for (i=0; i<mbs; i++) {
549: jl[i] = mbs; il[0] = 0;
550: }
551: PetscMalloc((2*bs2+bs)*sizeof(MatScalar),&dk);
552: uik = dk + bs2;
553: work = uik + bs2;
554: PetscMalloc(bs*sizeof(PetscInt),&pivots);
555:
556: ISGetIndices(perm,&perm_ptr);
557:
558: /* check permutation */
559: if (!a->permute){
560: ai = a->i; aj = a->j; aa = a->a;
561: } else {
562: ai = a->inew; aj = a->jnew;
563: PetscMalloc(bs2*ai[mbs]*sizeof(MatScalar),&aa);
564: PetscMemcpy(aa,a->a,bs2*ai[mbs]*sizeof(MatScalar));
565: PetscMalloc(ai[mbs]*sizeof(PetscInt),&a2anew);
566: PetscMemcpy(a2anew,a->a2anew,(ai[mbs])*sizeof(PetscInt));
568: /* flops in while loop */
569: bslog = 2*bs*bs2;
571: for (i=0; i<mbs; i++){
572: jmin = ai[i]; jmax = ai[i+1];
573: for (j=jmin; j<jmax; j++){
574: while (a2anew[j] != j){
575: k = a2anew[j]; a2anew[j] = a2anew[k]; a2anew[k] = k;
576: for (k1=0; k1<bs2; k1++){
577: dk[k1] = aa[k*bs2+k1];
578: aa[k*bs2+k1] = aa[j*bs2+k1];
579: aa[j*bs2+k1] = dk[k1];
580: }
581: }
582: /* transform columnoriented blocks that lie in the lower triangle to roworiented blocks */
583: if (i > aj[j]){
584: /* printf("change orientation, row: %d, col: %d\n",i,aj[j]); */
585: ap = aa + j*bs2; /* ptr to the beginning of j-th block of aa */
586: for (k=0; k<bs2; k++) dk[k] = ap[k]; /* dk <- j-th block of aa */
587: for (k=0; k<bs; k++){ /* j-th block of aa <- dk^T */
588: for (k1=0; k1<bs; k1++) *ap++ = dk[k + bs*k1];
589: }
590: }
591: }
592: }
593: PetscFree(a2anew);
594: }
595:
596: /* for each row k */
597: for (k = 0; k<mbs; k++){
599: /*initialize k-th row with elements nonzero in row perm(k) of A */
600: jmin = ai[perm_ptr[k]]; jmax = ai[perm_ptr[k]+1];
601:
602: ap = aa + jmin*bs2;
603: for (j = jmin; j < jmax; j++){
604: vj = perm_ptr[aj[j]]; /* block col. index */
605: rtmp_ptr = rtmp + vj*bs2;
606: for (i=0; i<bs2; i++) *rtmp_ptr++ = *ap++;
607: }
609: /* modify k-th row by adding in those rows i with U(i,k) != 0 */
610: PetscMemcpy(dk,rtmp+k*bs2,bs2*sizeof(MatScalar));
611: i = jl[k]; /* first row to be added to k_th row */
613: while (i < k){
614: nexti = jl[i]; /* next row to be added to k_th row */
616: /* compute multiplier */
617: ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */
619: /* uik = -inv(Di)*U_bar(i,k) */
620: diag = ba + i*bs2;
621: u = ba + ili*bs2;
622: PetscMemzero(uik,bs2*sizeof(MatScalar));
623: Kernel_A_gets_A_minus_B_times_C(bs,uik,diag,u);
624:
625: /* update D(k) += -U(i,k)^T * U_bar(i,k) */
626: Kernel_A_gets_A_plus_Btranspose_times_C(bs,dk,uik,u);
627: PetscLogFlops(bslog*2);
628:
629: /* update -U(i,k) */
630: PetscMemcpy(ba+ili*bs2,uik,bs2*sizeof(MatScalar));
632: /* add multiple of row i to k-th row ... */
633: jmin = ili + 1; jmax = bi[i+1];
634: if (jmin < jmax){
635: for (j=jmin; j<jmax; j++) {
636: /* rtmp += -U(i,k)^T * U_bar(i,j) */
637: rtmp_ptr = rtmp + bj[j]*bs2;
638: u = ba + j*bs2;
639: Kernel_A_gets_A_plus_Btranspose_times_C(bs,rtmp_ptr,uik,u);
640: }
641: PetscLogFlops(bslog*(jmax-jmin));
642:
643: /* ... add i to row list for next nonzero entry */
644: il[i] = jmin; /* update il(i) in column k+1, ... mbs-1 */
645: j = bj[jmin];
646: jl[i] = jl[j]; jl[j] = i; /* update jl */
647: }
648: i = nexti;
649: }
651: /* save nonzero entries in k-th row of U ... */
653: /* invert diagonal block */
654: diag = ba+k*bs2;
655: PetscMemcpy(diag,dk,bs2*sizeof(MatScalar));
656: Kernel_A_gets_inverse_A(bs,diag,pivots,work);
657:
658: jmin = bi[k]; jmax = bi[k+1];
659: if (jmin < jmax) {
660: for (j=jmin; j<jmax; j++){
661: vj = bj[j]; /* block col. index of U */
662: u = ba + j*bs2;
663: rtmp_ptr = rtmp + vj*bs2;
664: for (k1=0; k1<bs2; k1++){
665: *u++ = *rtmp_ptr;
666: *rtmp_ptr++ = 0.0;
667: }
668: }
669:
670: /* ... add k to row list for first nonzero entry in k-th row */
671: il[k] = jmin;
672: i = bj[jmin];
673: jl[k] = jl[i]; jl[i] = k;
674: }
675: }
677: PetscFree(rtmp);
678: PetscFree(il);
679: PetscFree(dk);
680: PetscFree(pivots);
681: if (a->permute){
682: PetscFree(aa);
683: }
685: ISRestoreIndices(perm,&perm_ptr);
686: C->factor = FACTOR_CHOLESKY;
687: C->assembled = PETSC_TRUE;
688: C->preallocated = PETSC_TRUE;
689: PetscLogFlops(1.3333*bs*bs2*b->mbs); /* from inverting diagonal blocks */
690: return(0);
691: }
695: PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_N_NaturalOrdering(Mat A,MatFactorInfo *info,Mat *B)
696: {
697: Mat C = *B;
698: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
700: PetscInt i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
701: PetscInt *ai,*aj,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
702: PetscInt bs=A->rmap.bs,bs2 = a->bs2,bslog;
703: MatScalar *ba = b->a,*aa,*ap,*dk,*uik;
704: MatScalar *u,*diag,*rtmp,*rtmp_ptr;
705: MatScalar *work;
706: PetscInt *pivots;
709: PetscMalloc(bs2*mbs*sizeof(MatScalar),&rtmp);
710: PetscMemzero(rtmp,bs2*mbs*sizeof(MatScalar));
711: PetscMalloc(2*mbs*sizeof(PetscInt),&il);
712: jl = il + mbs;
713: for (i=0; i<mbs; i++) {
714: jl[i] = mbs; il[0] = 0;
715: }
716: PetscMalloc((2*bs2+bs)*sizeof(MatScalar),&dk);
717: uik = dk + bs2;
718: work = uik + bs2;
719: PetscMalloc(bs*sizeof(PetscInt),&pivots);
720:
721: ai = a->i; aj = a->j; aa = a->a;
723: /* flops in while loop */
724: bslog = 2*bs*bs2;
725:
726: /* for each row k */
727: for (k = 0; k<mbs; k++){
729: /*initialize k-th row with elements nonzero in row k of A */
730: jmin = ai[k]; jmax = ai[k+1];
731: ap = aa + jmin*bs2;
732: for (j = jmin; j < jmax; j++){
733: vj = aj[j]; /* block col. index */
734: rtmp_ptr = rtmp + vj*bs2;
735: for (i=0; i<bs2; i++) *rtmp_ptr++ = *ap++;
736: }
738: /* modify k-th row by adding in those rows i with U(i,k) != 0 */
739: PetscMemcpy(dk,rtmp+k*bs2,bs2*sizeof(MatScalar));
740: i = jl[k]; /* first row to be added to k_th row */
742: while (i < k){
743: nexti = jl[i]; /* next row to be added to k_th row */
745: /* compute multiplier */
746: ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */
748: /* uik = -inv(Di)*U_bar(i,k) */
749: diag = ba + i*bs2;
750: u = ba + ili*bs2;
751: PetscMemzero(uik,bs2*sizeof(MatScalar));
752: Kernel_A_gets_A_minus_B_times_C(bs,uik,diag,u);
753:
754: /* update D(k) += -U(i,k)^T * U_bar(i,k) */
755: Kernel_A_gets_A_plus_Btranspose_times_C(bs,dk,uik,u);
756: PetscLogFlops(bslog*2);
757:
758: /* update -U(i,k) */
759: PetscMemcpy(ba+ili*bs2,uik,bs2*sizeof(MatScalar));
761: /* add multiple of row i to k-th row ... */
762: jmin = ili + 1; jmax = bi[i+1];
763: if (jmin < jmax){
764: for (j=jmin; j<jmax; j++) {
765: /* rtmp += -U(i,k)^T * U_bar(i,j) */
766: rtmp_ptr = rtmp + bj[j]*bs2;
767: u = ba + j*bs2;
768: Kernel_A_gets_A_plus_Btranspose_times_C(bs,rtmp_ptr,uik,u);
769: }
770: PetscLogFlops(bslog*(jmax-jmin));
771:
772: /* ... add i to row list for next nonzero entry */
773: il[i] = jmin; /* update il(i) in column k+1, ... mbs-1 */
774: j = bj[jmin];
775: jl[i] = jl[j]; jl[j] = i; /* update jl */
776: }
777: i = nexti;
778: }
780: /* save nonzero entries in k-th row of U ... */
782: /* invert diagonal block */
783: diag = ba+k*bs2;
784: PetscMemcpy(diag,dk,bs2*sizeof(MatScalar));
785: Kernel_A_gets_inverse_A(bs,diag,pivots,work);
786:
787: jmin = bi[k]; jmax = bi[k+1];
788: if (jmin < jmax) {
789: for (j=jmin; j<jmax; j++){
790: vj = bj[j]; /* block col. index of U */
791: u = ba + j*bs2;
792: rtmp_ptr = rtmp + vj*bs2;
793: for (k1=0; k1<bs2; k1++){
794: *u++ = *rtmp_ptr;
795: *rtmp_ptr++ = 0.0;
796: }
797: }
798:
799: /* ... add k to row list for first nonzero entry in k-th row */
800: il[k] = jmin;
801: i = bj[jmin];
802: jl[k] = jl[i]; jl[i] = k;
803: }
804: }
806: PetscFree(rtmp);
807: PetscFree(il);
808: PetscFree(dk);
809: PetscFree(pivots);
811: C->factor = FACTOR_CHOLESKY;
812: C->assembled = PETSC_TRUE;
813: C->preallocated = PETSC_TRUE;
814: PetscLogFlops(1.3333*bs*bs2*b->mbs); /* from inverting diagonal blocks */
815: return(0);
816: }
818: /*
819: Numeric U^T*D*U factorization for SBAIJ format. Modified from SNF of YSMP.
820: Version for blocks 2 by 2.
821: */
824: PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_2(Mat A,MatFactorInfo *info,Mat *B)
825: {
826: Mat C = *B;
827: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
828: IS perm = b->row;
830: PetscInt *perm_ptr,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
831: PetscInt *ai,*aj,*a2anew,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
832: MatScalar *ba = b->a,*aa,*ap,*dk,*uik;
833: MatScalar *u,*diag,*rtmp,*rtmp_ptr;
836: /* initialization */
837: /* il and jl record the first nonzero element in each row of the accessing
838: window U(0:k, k:mbs-1).
839: jl: list of rows to be added to uneliminated rows
840: i>= k: jl(i) is the first row to be added to row i
841: i< k: jl(i) is the row following row i in some list of rows
842: jl(i) = mbs indicates the end of a list
843: il(i): points to the first nonzero element in columns k,...,mbs-1 of
844: row i of U */
845: PetscMalloc(4*mbs*sizeof(MatScalar),&rtmp);
846: PetscMemzero(rtmp,4*mbs*sizeof(MatScalar));
847: PetscMalloc(2*mbs*sizeof(PetscInt),&il);
848: jl = il + mbs;
849: for (i=0; i<mbs; i++) {
850: jl[i] = mbs; il[0] = 0;
851: }
852: PetscMalloc(8*sizeof(MatScalar),&dk);
853: uik = dk + 4;
854: ISGetIndices(perm,&perm_ptr);
856: /* check permutation */
857: if (!a->permute){
858: ai = a->i; aj = a->j; aa = a->a;
859: } else {
860: ai = a->inew; aj = a->jnew;
861: PetscMalloc(4*ai[mbs]*sizeof(MatScalar),&aa);
862: PetscMemcpy(aa,a->a,4*ai[mbs]*sizeof(MatScalar));
863: PetscMalloc(ai[mbs]*sizeof(PetscInt),&a2anew);
864: PetscMemcpy(a2anew,a->a2anew,(ai[mbs])*sizeof(PetscInt));
866: for (i=0; i<mbs; i++){
867: jmin = ai[i]; jmax = ai[i+1];
868: for (j=jmin; j<jmax; j++){
869: while (a2anew[j] != j){
870: k = a2anew[j]; a2anew[j] = a2anew[k]; a2anew[k] = k;
871: for (k1=0; k1<4; k1++){
872: dk[k1] = aa[k*4+k1];
873: aa[k*4+k1] = aa[j*4+k1];
874: aa[j*4+k1] = dk[k1];
875: }
876: }
877: /* transform columnoriented blocks that lie in the lower triangle to roworiented blocks */
878: if (i > aj[j]){
879: /* printf("change orientation, row: %d, col: %d\n",i,aj[j]); */
880: ap = aa + j*4; /* ptr to the beginning of the block */
881: dk[1] = ap[1]; /* swap ap[1] and ap[2] */
882: ap[1] = ap[2];
883: ap[2] = dk[1];
884: }
885: }
886: }
887: PetscFree(a2anew);
888: }
890: /* for each row k */
891: for (k = 0; k<mbs; k++){
893: /*initialize k-th row with elements nonzero in row perm(k) of A */
894: jmin = ai[perm_ptr[k]]; jmax = ai[perm_ptr[k]+1];
895: ap = aa + jmin*4;
896: for (j = jmin; j < jmax; j++){
897: vj = perm_ptr[aj[j]]; /* block col. index */
898: rtmp_ptr = rtmp + vj*4;
899: for (i=0; i<4; i++) *rtmp_ptr++ = *ap++;
900: }
902: /* modify k-th row by adding in those rows i with U(i,k) != 0 */
903: PetscMemcpy(dk,rtmp+k*4,4*sizeof(MatScalar));
904: i = jl[k]; /* first row to be added to k_th row */
906: while (i < k){
907: nexti = jl[i]; /* next row to be added to k_th row */
909: /* compute multiplier */
910: ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */
912: /* uik = -inv(Di)*U_bar(i,k): - ba[ili]*ba[i] */
913: diag = ba + i*4;
914: u = ba + ili*4;
915: uik[0] = -(diag[0]*u[0] + diag[2]*u[1]);
916: uik[1] = -(diag[1]*u[0] + diag[3]*u[1]);
917: uik[2] = -(diag[0]*u[2] + diag[2]*u[3]);
918: uik[3] = -(diag[1]*u[2] + diag[3]*u[3]);
919:
920: /* update D(k) += -U(i,k)^T * U_bar(i,k): dk += uik*ba[ili] */
921: dk[0] += uik[0]*u[0] + uik[1]*u[1];
922: dk[1] += uik[2]*u[0] + uik[3]*u[1];
923: dk[2] += uik[0]*u[2] + uik[1]*u[3];
924: dk[3] += uik[2]*u[2] + uik[3]*u[3];
926: PetscLogFlops(16*2);
928: /* update -U(i,k): ba[ili] = uik */
929: PetscMemcpy(ba+ili*4,uik,4*sizeof(MatScalar));
931: /* add multiple of row i to k-th row ... */
932: jmin = ili + 1; jmax = bi[i+1];
933: if (jmin < jmax){
934: for (j=jmin; j<jmax; j++) {
935: /* rtmp += -U(i,k)^T * U_bar(i,j): rtmp[bj[j]] += uik*ba[j]; */
936: rtmp_ptr = rtmp + bj[j]*4;
937: u = ba + j*4;
938: rtmp_ptr[0] += uik[0]*u[0] + uik[1]*u[1];
939: rtmp_ptr[1] += uik[2]*u[0] + uik[3]*u[1];
940: rtmp_ptr[2] += uik[0]*u[2] + uik[1]*u[3];
941: rtmp_ptr[3] += uik[2]*u[2] + uik[3]*u[3];
942: }
943: PetscLogFlops(16*(jmax-jmin));
944:
945: /* ... add i to row list for next nonzero entry */
946: il[i] = jmin; /* update il(i) in column k+1, ... mbs-1 */
947: j = bj[jmin];
948: jl[i] = jl[j]; jl[j] = i; /* update jl */
949: }
950: i = nexti;
951: }
953: /* save nonzero entries in k-th row of U ... */
955: /* invert diagonal block */
956: diag = ba+k*4;
957: PetscMemcpy(diag,dk,4*sizeof(MatScalar));
958: Kernel_A_gets_inverse_A_2(diag);
959:
960: jmin = bi[k]; jmax = bi[k+1];
961: if (jmin < jmax) {
962: for (j=jmin; j<jmax; j++){
963: vj = bj[j]; /* block col. index of U */
964: u = ba + j*4;
965: rtmp_ptr = rtmp + vj*4;
966: for (k1=0; k1<4; k1++){
967: *u++ = *rtmp_ptr;
968: *rtmp_ptr++ = 0.0;
969: }
970: }
971:
972: /* ... add k to row list for first nonzero entry in k-th row */
973: il[k] = jmin;
974: i = bj[jmin];
975: jl[k] = jl[i]; jl[i] = k;
976: }
977: }
979: PetscFree(rtmp);
980: PetscFree(il);
981: PetscFree(dk);
982: if (a->permute) {
983: PetscFree(aa);
984: }
985: ISRestoreIndices(perm,&perm_ptr);
986: C->factor = FACTOR_CHOLESKY;
987: C->assembled = PETSC_TRUE;
988: C->preallocated = PETSC_TRUE;
989: PetscLogFlops(1.3333*8*b->mbs); /* from inverting diagonal blocks */
990: return(0);
991: }
993: /*
994: Version for when blocks are 2 by 2 Using natural ordering
995: */
998: PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_2_NaturalOrdering(Mat A,MatFactorInfo *info,Mat *B)
999: {
1000: Mat C = *B;
1001: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
1003: PetscInt i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
1004: PetscInt *ai,*aj,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
1005: MatScalar *ba = b->a,*aa,*ap,*dk,*uik;
1006: MatScalar *u,*diag,*rtmp,*rtmp_ptr;
1009: /* initialization */
1010: /* il and jl record the first nonzero element in each row of the accessing
1011: window U(0:k, k:mbs-1).
1012: jl: list of rows to be added to uneliminated rows
1013: i>= k: jl(i) is the first row to be added to row i
1014: i< k: jl(i) is the row following row i in some list of rows
1015: jl(i) = mbs indicates the end of a list
1016: il(i): points to the first nonzero element in columns k,...,mbs-1 of
1017: row i of U */
1018: PetscMalloc(4*mbs*sizeof(MatScalar),&rtmp);
1019: PetscMemzero(rtmp,4*mbs*sizeof(MatScalar));
1020: PetscMalloc(2*mbs*sizeof(PetscInt),&il);
1021: jl = il + mbs;
1022: for (i=0; i<mbs; i++) {
1023: jl[i] = mbs; il[0] = 0;
1024: }
1025: PetscMalloc(8*sizeof(MatScalar),&dk);
1026: uik = dk + 4;
1027:
1028: ai = a->i; aj = a->j; aa = a->a;
1030: /* for each row k */
1031: for (k = 0; k<mbs; k++){
1033: /*initialize k-th row with elements nonzero in row k of A */
1034: jmin = ai[k]; jmax = ai[k+1];
1035: ap = aa + jmin*4;
1036: for (j = jmin; j < jmax; j++){
1037: vj = aj[j]; /* block col. index */
1038: rtmp_ptr = rtmp + vj*4;
1039: for (i=0; i<4; i++) *rtmp_ptr++ = *ap++;
1040: }
1041:
1042: /* modify k-th row by adding in those rows i with U(i,k) != 0 */
1043: PetscMemcpy(dk,rtmp+k*4,4*sizeof(MatScalar));
1044: i = jl[k]; /* first row to be added to k_th row */
1046: while (i < k){
1047: nexti = jl[i]; /* next row to be added to k_th row */
1049: /* compute multiplier */
1050: ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */
1052: /* uik = -inv(Di)*U_bar(i,k): - ba[ili]*ba[i] */
1053: diag = ba + i*4;
1054: u = ba + ili*4;
1055: uik[0] = -(diag[0]*u[0] + diag[2]*u[1]);
1056: uik[1] = -(diag[1]*u[0] + diag[3]*u[1]);
1057: uik[2] = -(diag[0]*u[2] + diag[2]*u[3]);
1058: uik[3] = -(diag[1]*u[2] + diag[3]*u[3]);
1059:
1060: /* update D(k) += -U(i,k)^T * U_bar(i,k): dk += uik*ba[ili] */
1061: dk[0] += uik[0]*u[0] + uik[1]*u[1];
1062: dk[1] += uik[2]*u[0] + uik[3]*u[1];
1063: dk[2] += uik[0]*u[2] + uik[1]*u[3];
1064: dk[3] += uik[2]*u[2] + uik[3]*u[3];
1066: PetscLogFlops(16*2);
1068: /* update -U(i,k): ba[ili] = uik */
1069: PetscMemcpy(ba+ili*4,uik,4*sizeof(MatScalar));
1071: /* add multiple of row i to k-th row ... */
1072: jmin = ili + 1; jmax = bi[i+1];
1073: if (jmin < jmax){
1074: for (j=jmin; j<jmax; j++) {
1075: /* rtmp += -U(i,k)^T * U_bar(i,j): rtmp[bj[j]] += uik*ba[j]; */
1076: rtmp_ptr = rtmp + bj[j]*4;
1077: u = ba + j*4;
1078: rtmp_ptr[0] += uik[0]*u[0] + uik[1]*u[1];
1079: rtmp_ptr[1] += uik[2]*u[0] + uik[3]*u[1];
1080: rtmp_ptr[2] += uik[0]*u[2] + uik[1]*u[3];
1081: rtmp_ptr[3] += uik[2]*u[2] + uik[3]*u[3];
1082: }
1083: PetscLogFlops(16*(jmax-jmin));
1085: /* ... add i to row list for next nonzero entry */
1086: il[i] = jmin; /* update il(i) in column k+1, ... mbs-1 */
1087: j = bj[jmin];
1088: jl[i] = jl[j]; jl[j] = i; /* update jl */
1089: }
1090: i = nexti;
1091: }
1093: /* save nonzero entries in k-th row of U ... */
1095: /* invert diagonal block */
1096: diag = ba+k*4;
1097: PetscMemcpy(diag,dk,4*sizeof(MatScalar));
1098: Kernel_A_gets_inverse_A_2(diag);
1099:
1100: jmin = bi[k]; jmax = bi[k+1];
1101: if (jmin < jmax) {
1102: for (j=jmin; j<jmax; j++){
1103: vj = bj[j]; /* block col. index of U */
1104: u = ba + j*4;
1105: rtmp_ptr = rtmp + vj*4;
1106: for (k1=0; k1<4; k1++){
1107: *u++ = *rtmp_ptr;
1108: *rtmp_ptr++ = 0.0;
1109: }
1110: }
1111:
1112: /* ... add k to row list for first nonzero entry in k-th row */
1113: il[k] = jmin;
1114: i = bj[jmin];
1115: jl[k] = jl[i]; jl[i] = k;
1116: }
1117: }
1119: PetscFree(rtmp);
1120: PetscFree(il);
1121: PetscFree(dk);
1123: C->factor = FACTOR_CHOLESKY;
1124: C->assembled = PETSC_TRUE;
1125: C->preallocated = PETSC_TRUE;
1126: PetscLogFlops(1.3333*8*b->mbs); /* from inverting diagonal blocks */
1127: return(0);
1128: }
1130: /*
1131: Numeric U^T*D*U factorization for SBAIJ format.
1132: Version for blocks are 1 by 1.
1133: */
1136: PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_1(Mat A,MatFactorInfo *info,Mat *B)
1137: {
1138: Mat C = *B;
1139: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data,*b=(Mat_SeqSBAIJ *)C->data;
1140: IS ip=b->row;
1142: PetscInt *rip,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j,*bcol;
1143: PetscInt *ai,*aj,*a2anew;
1144: PetscInt k,jmin,jmax,*jl,*il,col,nexti,ili,nz;
1145: MatScalar *rtmp,*ba=b->a,*bval,*aa,dk,uikdi;
1146: PetscReal zeropivot,rs,shiftnz;
1147: PetscReal shiftpd;
1148: ChShift_Ctx sctx;
1149: PetscInt newshift;
1152: /* initialization */
1153: shiftnz = info->shiftnz;
1154: shiftpd = info->shiftpd;
1155: zeropivot = info->zeropivot;
1157: ISGetIndices(ip,&rip);
1158: if (!a->permute){
1159: ai = a->i; aj = a->j; aa = a->a;
1160: } else {
1161: ai = a->inew; aj = a->jnew;
1162: nz = ai[mbs];
1163: PetscMalloc(nz*sizeof(MatScalar),&aa);
1164: a2anew = a->a2anew;
1165: bval = a->a;
1166: for (j=0; j<nz; j++){
1167: aa[a2anew[j]] = *(bval++);
1168: }
1169: }
1170:
1171: /* initialization */
1172: /* il and jl record the first nonzero element in each row of the accessing
1173: window U(0:k, k:mbs-1).
1174: jl: list of rows to be added to uneliminated rows
1175: i>= k: jl(i) is the first row to be added to row i
1176: i< k: jl(i) is the row following row i in some list of rows
1177: jl(i) = mbs indicates the end of a list
1178: il(i): points to the first nonzero element in columns k,...,mbs-1 of
1179: row i of U */
1180: nz = (2*mbs+1)*sizeof(PetscInt)+mbs*sizeof(MatScalar);
1181: PetscMalloc(nz,&il);
1182: jl = il + mbs;
1183: rtmp = (MatScalar*)(jl + mbs);
1185: sctx.shift_amount = 0;
1186: sctx.nshift = 0;
1187: do {
1188: sctx.chshift = PETSC_FALSE;
1189: for (i=0; i<mbs; i++) {
1190: rtmp[i] = 0.0; jl[i] = mbs; il[0] = 0;
1191: }
1192:
1193: for (k = 0; k<mbs; k++){
1194: /*initialize k-th row by the perm[k]-th row of A */
1195: jmin = ai[rip[k]]; jmax = ai[rip[k]+1];
1196: bval = ba + bi[k];
1197: for (j = jmin; j < jmax; j++){
1198: col = rip[aj[j]];
1199: rtmp[col] = aa[j];
1200: *bval++ = 0.0; /* for in-place factorization */
1201: }
1203: /* shift the diagonal of the matrix */
1204: if (sctx.nshift) rtmp[k] += sctx.shift_amount;
1206: /* modify k-th row by adding in those rows i with U(i,k)!=0 */
1207: dk = rtmp[k];
1208: i = jl[k]; /* first row to be added to k_th row */
1210: while (i < k){
1211: nexti = jl[i]; /* next row to be added to k_th row */
1213: /* compute multiplier, update diag(k) and U(i,k) */
1214: ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */
1215: uikdi = - ba[ili]*ba[bi[i]]; /* diagonal(k) */
1216: dk += uikdi*ba[ili];
1217: ba[ili] = uikdi; /* -U(i,k) */
1219: /* add multiple of row i to k-th row */
1220: jmin = ili + 1; jmax = bi[i+1];
1221: if (jmin < jmax){
1222: for (j=jmin; j<jmax; j++) rtmp[bj[j]] += uikdi*ba[j];
1223: PetscLogFlops(2*(jmax-jmin));
1225: /* update il and jl for row i */
1226: il[i] = jmin;
1227: j = bj[jmin]; jl[i] = jl[j]; jl[j] = i;
1228: }
1229: i = nexti;
1230: }
1232: /* shift the diagonals when zero pivot is detected */
1233: /* compute rs=sum of abs(off-diagonal) */
1234: rs = 0.0;
1235: jmin = bi[k]+1;
1236: nz = bi[k+1] - jmin;
1237: if (nz){
1238: bcol = bj + jmin;
1239: while (nz--){
1240: rs += PetscAbsScalar(rtmp[*bcol]);
1241: bcol++;
1242: }
1243: }
1245: sctx.rs = rs;
1246: sctx.pv = dk;
1247: MatCholeskyCheckShift_inline(info,sctx,k,newshift);
1248: if (newshift == 1) break; /* sctx.shift_amount is updated */
1249:
1250: /* copy data into U(k,:) */
1251: ba[bi[k]] = 1.0/dk; /* U(k,k) */
1252: jmin = bi[k]+1; jmax = bi[k+1];
1253: if (jmin < jmax) {
1254: for (j=jmin; j<jmax; j++){
1255: col = bj[j]; ba[j] = rtmp[col]; rtmp[col] = 0.0;
1256: }
1257: /* add the k-th row into il and jl */
1258: il[k] = jmin;
1259: i = bj[jmin]; jl[k] = jl[i]; jl[i] = k;
1260: }
1261: }
1262: } while (sctx.chshift);
1263: PetscFree(il);
1264: if (a->permute){PetscFree(aa);}
1266: ISRestoreIndices(ip,&rip);
1267: C->factor = FACTOR_CHOLESKY;
1268: C->assembled = PETSC_TRUE;
1269: C->preallocated = PETSC_TRUE;
1270: PetscLogFlops(C->rmap.N);
1271: if (sctx.nshift){
1272: if (shiftnz) {
1273: PetscInfo2(0,"number of shiftnz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);
1274: } else if (shiftpd) {
1275: PetscInfo2(0,"number of shiftpd tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);
1276: }
1277: }
1278: return(0);
1279: }
1281: /*
1282: Version for when blocks are 1 by 1 Using natural ordering
1283: */
1286: PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering(Mat A,MatFactorInfo *info,Mat *B)
1287: {
1288: Mat C = *B;
1289: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data,*b=(Mat_SeqSBAIJ *)C->data;
1291: PetscInt i,j,mbs = a->mbs;
1292: PetscInt *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
1293: PetscInt k,jmin,*jl,*il,nexti,ili,*acol,*bcol,nz;
1294: MatScalar *rtmp,*ba=b->a,*aa=a->a,dk,uikdi,*aval,*bval;
1295: PetscReal zeropivot,rs,shiftnz;
1296: PetscReal shiftpd;
1297: ChShift_Ctx sctx;
1298: PetscInt newshift;
1301: /* initialization */
1302: shiftnz = info->shiftnz;
1303: shiftpd = info->shiftpd;
1304: zeropivot = info->zeropivot;
1306: /* il and jl record the first nonzero element in each row of the accessing
1307: window U(0:k, k:mbs-1).
1308: jl: list of rows to be added to uneliminated rows
1309: i>= k: jl(i) is the first row to be added to row i
1310: i< k: jl(i) is the row following row i in some list of rows
1311: jl(i) = mbs indicates the end of a list
1312: il(i): points to the first nonzero element in U(i,k:mbs-1)
1313: */
1314: PetscMalloc(mbs*sizeof(MatScalar),&rtmp);
1315: PetscMalloc(2*mbs*sizeof(PetscInt),&il);
1316: jl = il + mbs;
1318: sctx.shift_amount = 0;
1319: sctx.nshift = 0;
1320: do {
1321: sctx.chshift = PETSC_FALSE;
1322: for (i=0; i<mbs; i++) {
1323: rtmp[i] = 0.0; jl[i] = mbs; il[0] = 0;
1324: }
1326: for (k = 0; k<mbs; k++){
1327: /*initialize k-th row with elements nonzero in row perm(k) of A */
1328: nz = ai[k+1] - ai[k];
1329: acol = aj + ai[k];
1330: aval = aa + ai[k];
1331: bval = ba + bi[k];
1332: while (nz -- ){
1333: rtmp[*acol++] = *aval++;
1334: *bval++ = 0.0; /* for in-place factorization */
1335: }
1337: /* shift the diagonal of the matrix */
1338: if (sctx.nshift) rtmp[k] += sctx.shift_amount;
1339:
1340: /* modify k-th row by adding in those rows i with U(i,k)!=0 */
1341: dk = rtmp[k];
1342: i = jl[k]; /* first row to be added to k_th row */
1344: while (i < k){
1345: nexti = jl[i]; /* next row to be added to k_th row */
1346: /* compute multiplier, update D(k) and U(i,k) */
1347: ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */
1348: uikdi = - ba[ili]*ba[bi[i]];
1349: dk += uikdi*ba[ili];
1350: ba[ili] = uikdi; /* -U(i,k) */
1352: /* add multiple of row i to k-th row ... */
1353: jmin = ili + 1;
1354: nz = bi[i+1] - jmin;
1355: if (nz > 0){
1356: bcol = bj + jmin;
1357: bval = ba + jmin;
1358: PetscLogFlops(2*nz);
1359: while (nz --) rtmp[*bcol++] += uikdi*(*bval++);
1360:
1361: /* update il and jl for i-th row */
1362: il[i] = jmin;
1363: j = bj[jmin]; jl[i] = jl[j]; jl[j] = i;
1364: }
1365: i = nexti;
1366: }
1368: /* shift the diagonals when zero pivot is detected */
1369: /* compute rs=sum of abs(off-diagonal) */
1370: rs = 0.0;
1371: jmin = bi[k]+1;
1372: nz = bi[k+1] - jmin;
1373: if (nz){
1374: bcol = bj + jmin;
1375: while (nz--){
1376: rs += PetscAbsScalar(rtmp[*bcol]);
1377: bcol++;
1378: }
1379: }
1381: sctx.rs = rs;
1382: sctx.pv = dk;
1383: MatCholeskyCheckShift_inline(info,sctx,k,newshift);
1384: if (newshift == 1) break; /* sctx.shift_amount is updated */
1385:
1386: /* copy data into U(k,:) */
1387: ba[bi[k]] = 1.0/dk;
1388: jmin = bi[k]+1;
1389: nz = bi[k+1] - jmin;
1390: if (nz){
1391: bcol = bj + jmin;
1392: bval = ba + jmin;
1393: while (nz--){
1394: *bval++ = rtmp[*bcol];
1395: rtmp[*bcol++] = 0.0;
1396: }
1397: /* add k-th row into il and jl */
1398: il[k] = jmin;
1399: i = bj[jmin]; jl[k] = jl[i]; jl[i] = k;
1400: }
1401: } /* end of for (k = 0; k<mbs; k++) */
1402: } while (sctx.chshift);
1403: PetscFree(rtmp);
1404: PetscFree(il);
1405:
1406: C->factor = FACTOR_CHOLESKY;
1407: C->assembled = PETSC_TRUE;
1408: C->preallocated = PETSC_TRUE;
1409: PetscLogFlops(C->rmap.N);
1410: if (sctx.nshift){
1411: if (shiftnz) {
1412: PetscInfo2(0,"number of shiftnz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);
1413: } else if (shiftpd) {
1414: PetscInfo2(0,"number of shiftpd tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);
1415: }
1416: }
1417: return(0);
1418: }
1422: PetscErrorCode MatCholeskyFactor_SeqSBAIJ(Mat A,IS perm,MatFactorInfo *info)
1423: {
1425: Mat C;
1428: MatCholeskyFactorSymbolic(A,perm,info,&C);
1429: MatCholeskyFactorNumeric(A,info,&C);
1430: MatHeaderCopy(A,C);
1431: return(0);
1432: }