Actual source code: inode.c
1: #define PETSCMAT_DLL
3: /*
4: This file provides high performance routines for the Inode format (compressed sparse row)
5: by taking advantage of rows with identical nonzero structure (I-nodes).
6: */
7: #include src/mat/impls/aij/seq/aij.h
11: static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt* size,PetscInt ** ns)
12: {
13: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
15: PetscInt i,count,m,n,min_mn,*ns_row,*ns_col;
18: n = A->cmap.n;
19: m = A->rmap.n;
20: ns_row = a->inode.size;
21:
22: min_mn = (m < n) ? m : n;
23: if (!ns) {
24: for (count=0,i=0; count<min_mn; count+=ns_row[i],i++);
25: for(; count+1 < n; count++,i++);
26: if (count < n) {
27: i++;
28: }
29: *size = i;
30: return(0);
31: }
32: PetscMalloc((n+1)*sizeof(PetscInt),&ns_col);
33:
34: /* Use the same row structure wherever feasible. */
35: for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) {
36: ns_col[i] = ns_row[i];
37: }
39: /* if m < n; pad up the remainder with inode_limit */
40: for(; count+1 < n; count++,i++) {
41: ns_col[i] = 1;
42: }
43: /* The last node is the odd ball. padd it up with the remaining rows; */
44: if (count < n) {
45: ns_col[i] = n - count;
46: i++;
47: } else if (count > n) {
48: /* Adjust for the over estimation */
49: ns_col[i-1] += n - count;
50: }
51: *size = i;
52: *ns = ns_col;
53: return(0);
54: }
57: /*
58: This builds symmetric version of nonzero structure,
59: */
62: static PetscErrorCode MatGetRowIJ_Inode_Symmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
63: {
64: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
66: PetscInt *work,*ia,*ja,*j,nz,nslim_row,nslim_col,m,row,col,*jmax,n;
67: PetscInt *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2,*ai= a->i,*aj = a->j;
70: nslim_row = a->inode.node_count;
71: m = A->rmap.n;
72: n = A->cmap.n;
73: if (m != n) SETERRQ(PETSC_ERR_SUP,"MatGetRowIJ_Inode_Symmetric: Matrix should be square");
74:
75: /* Use the row_inode as column_inode */
76: nslim_col = nslim_row;
77: ns_col = ns_row;
79: /* allocate space for reformated inode structure */
80: PetscMalloc((nslim_col+1)*sizeof(PetscInt),&tns);
81: PetscMalloc((n+1)*sizeof(PetscInt),&tvc);
82: for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1];
84: for (i1=0,col=0; i1<nslim_col; ++i1){
85: nsz = ns_col[i1];
86: for (i2=0; i2<nsz; ++i2,++col)
87: tvc[col] = i1;
88: }
89: /* allocate space for row pointers */
90: PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);
91: *iia = ia;
92: PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));
93: PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);
95: /* determine the number of columns in each row */
96: ia[0] = oshift;
97: for (i1=0,row=0 ; i1<nslim_row; row+=ns_row[i1],i1++) {
99: j = aj + ai[row] + ishift;
100: jmax = aj + ai[row+1] + ishift;
101: i2 = 0;
102: col = *j++ + ishift;
103: i2 = tvc[col];
104: while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */
105: ia[i1+1]++;
106: ia[i2+1]++;
107: i2++; /* Start col of next node */
108: while(((col=*j+ishift)<tns[i2]) && (j<jmax)) ++j;
109: i2 = tvc[col];
110: }
111: if(i2 == i1) ia[i2+1]++; /* now the diagonal element */
112: }
114: /* shift ia[i] to point to next row */
115: for (i1=1; i1<nslim_row+1; i1++) {
116: row = ia[i1-1];
117: ia[i1] += row;
118: work[i1-1] = row - oshift;
119: }
121: /* allocate space for column pointers */
122: nz = ia[nslim_row] + (!ishift);
123: PetscMalloc(nz*sizeof(PetscInt),&ja);
124: *jja = ja;
126: /* loop over lower triangular part putting into ja */
127: for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) {
128: j = aj + ai[row] + ishift;
129: jmax = aj + ai[row+1] + ishift;
130: i2 = 0; /* Col inode index */
131: col = *j++ + ishift;
132: i2 = tvc[col];
133: while (i2<i1 && j<jmax) {
134: ja[work[i2]++] = i1 + oshift;
135: ja[work[i1]++] = i2 + oshift;
136: ++i2;
137: while(((col=*j+ishift)< tns[i2])&&(j<jmax)) ++j; /* Skip rest col indices in this node */
138: i2 = tvc[col];
139: }
140: if (i2 == i1) ja[work[i1]++] = i2 + oshift;
142: }
143: PetscFree(work);
144: PetscFree(tns);
145: PetscFree(tvc);
146: return(0);
147: }
149: /*
150: This builds nonsymmetric version of nonzero structure,
151: */
154: static PetscErrorCode MatGetRowIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
155: {
156: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
158: PetscInt *work,*ia,*ja,*j,nz,nslim_row,n,row,col,*ns_col,nslim_col;
159: PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
162: nslim_row = a->inode.node_count;
163: n = A->cmap.n;
165: /* Create The column_inode for this matrix */
166: Mat_CreateColInode(A,&nslim_col,&ns_col);
167:
168: /* allocate space for reformated column_inode structure */
169: PetscMalloc((nslim_col +1)*sizeof(PetscInt),&tns);
170: PetscMalloc((n +1)*sizeof(PetscInt),&tvc);
171: for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
173: for (i1=0,col=0; i1<nslim_col; ++i1){
174: nsz = ns_col[i1];
175: for (i2=0; i2<nsz; ++i2,++col)
176: tvc[col] = i1;
177: }
178: /* allocate space for row pointers */
179: PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);
180: *iia = ia;
181: PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));
182: PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);
184: /* determine the number of columns in each row */
185: ia[0] = oshift;
186: for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
187: j = aj + ai[row] + ishift;
188: col = *j++ + ishift;
189: i2 = tvc[col];
190: nz = ai[row+1] - ai[row];
191: while (nz-- > 0) { /* off-diagonal elemets */
192: ia[i1+1]++;
193: i2++; /* Start col of next node */
194: while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
195: if (nz > 0) i2 = tvc[col];
196: }
197: }
199: /* shift ia[i] to point to next row */
200: for (i1=1; i1<nslim_row+1; i1++) {
201: row = ia[i1-1];
202: ia[i1] += row;
203: work[i1-1] = row - oshift;
204: }
206: /* allocate space for column pointers */
207: nz = ia[nslim_row] + (!ishift);
208: PetscMalloc(nz*sizeof(PetscInt),&ja);
209: *jja = ja;
211: /* loop over matrix putting into ja */
212: for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
213: j = aj + ai[row] + ishift;
214: i2 = 0; /* Col inode index */
215: col = *j++ + ishift;
216: i2 = tvc[col];
217: nz = ai[row+1] - ai[row];
218: while (nz-- > 0) {
219: ja[work[i1]++] = i2 + oshift;
220: ++i2;
221: while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
222: if (nz > 0) i2 = tvc[col];
223: }
224: }
225: PetscFree(ns_col);
226: PetscFree(work);
227: PetscFree(tns);
228: PetscFree(tvc);
229: return(0);
230: }
234: static PetscErrorCode MatGetRowIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
235: {
236: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
240: *n = a->inode.node_count;
241: if (!ia) return(0);
242: if (!blockcompressed) {
243: MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);;
244: } else if (symmetric) {
245: MatGetRowIJ_Inode_Symmetric(A,ia,ja,0,oshift);
246: } else {
247: MatGetRowIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);
248: }
249: return(0);
250: }
254: static PetscErrorCode MatRestoreRowIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
255: {
259: if (!ia) return(0);
261: if (!blockcompressed) {
262: MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);;
263: } else {
264: PetscFree(*ia);
265: PetscFree(*ja);
266: }
268: return(0);
269: }
271: /* ----------------------------------------------------------- */
275: static PetscErrorCode MatGetColumnIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
276: {
277: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
279: PetscInt *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col;
280: PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
283: nslim_row = a->inode.node_count;
284: n = A->cmap.n;
286: /* Create The column_inode for this matrix */
287: Mat_CreateColInode(A,&nslim_col,&ns_col);
288:
289: /* allocate space for reformated column_inode structure */
290: PetscMalloc((nslim_col + 1)*sizeof(PetscInt),&tns);
291: PetscMalloc((n + 1)*sizeof(PetscInt),&tvc);
292: for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
294: for (i1=0,col=0; i1<nslim_col; ++i1){
295: nsz = ns_col[i1];
296: for (i2=0; i2<nsz; ++i2,++col)
297: tvc[col] = i1;
298: }
299: /* allocate space for column pointers */
300: PetscMalloc((nslim_col+1)*sizeof(PetscInt),&ia);
301: *iia = ia;
302: PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));
303: PetscMalloc((nslim_col+1)*sizeof(PetscInt),&work);
305: /* determine the number of columns in each row */
306: ia[0] = oshift;
307: for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
308: j = aj + ai[row] + ishift;
309: col = *j++ + ishift;
310: i2 = tvc[col];
311: nz = ai[row+1] - ai[row];
312: while (nz-- > 0) { /* off-diagonal elemets */
313: /* ia[i1+1]++; */
314: ia[i2+1]++;
315: i2++;
316: while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
317: if (nz > 0) i2 = tvc[col];
318: }
319: }
321: /* shift ia[i] to point to next col */
322: for (i1=1; i1<nslim_col+1; i1++) {
323: col = ia[i1-1];
324: ia[i1] += col;
325: work[i1-1] = col - oshift;
326: }
328: /* allocate space for column pointers */
329: nz = ia[nslim_col] + (!ishift);
330: PetscMalloc(nz*sizeof(PetscInt),&ja);
331: *jja = ja;
333: /* loop over matrix putting into ja */
334: for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
335: j = aj + ai[row] + ishift;
336: i2 = 0; /* Col inode index */
337: col = *j++ + ishift;
338: i2 = tvc[col];
339: nz = ai[row+1] - ai[row];
340: while (nz-- > 0) {
341: /* ja[work[i1]++] = i2 + oshift; */
342: ja[work[i2]++] = i1 + oshift;
343: i2++;
344: while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
345: if (nz > 0) i2 = tvc[col];
346: }
347: }
348: PetscFree(ns_col);
349: PetscFree(work);
350: PetscFree(tns);
351: PetscFree(tvc);
352: return(0);
353: }
357: static PetscErrorCode MatGetColumnIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
358: {
362: Mat_CreateColInode(A,n,PETSC_NULL);
363: if (!ia) return(0);
365: if (!blockcompressed) {
366: MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);;
367: } else if (symmetric) {
368: /* Since the indices are symmetric it does'nt matter */
369: MatGetRowIJ_Inode_Symmetric(A,ia,ja,0,oshift);
370: } else {
371: MatGetColumnIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);
372: }
373: return(0);
374: }
378: static PetscErrorCode MatRestoreColumnIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
379: {
383: if (!ia) return(0);
384: if (!blockcompressed) {
385: MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);;
386: } else {
387: PetscFree(*ia);
388: PetscFree(*ja);
389: }
390: return(0);
391: }
393: /* ----------------------------------------------------------- */
397: static PetscErrorCode MatMult_Inode(Mat A,Vec xx,Vec yy)
398: {
399: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
400: PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
401: PetscScalar *v1,*v2,*v3,*v4,*v5,*x,*y;
403: PetscInt *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz;
404:
405: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
406: #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5)
407: #endif
410: if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
411: node_max = a->inode.node_count;
412: ns = a->inode.size; /* Node Size array */
413: VecGetArray(xx,&x);
414: VecGetArray(yy,&y);
415: idx = a->j;
416: v1 = a->a;
417: ii = a->i;
419: for (i = 0,row = 0; i< node_max; ++i){
420: nsz = ns[i];
421: n = ii[1] - ii[0];
422: ii += nsz;
423: sz = n; /* No of non zeros in this row */
424: /* Switch on the size of Node */
425: switch (nsz){ /* Each loop in 'case' is unrolled */
426: case 1 :
427: sum1 = 0;
428:
429: for(n = 0; n< sz-1; n+=2) {
430: i1 = idx[0]; /* The instructions are ordered to */
431: i2 = idx[1]; /* make the compiler's job easy */
432: idx += 2;
433: tmp0 = x[i1];
434: tmp1 = x[i2];
435: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
436: }
437:
438: if (n == sz-1){ /* Take care of the last nonzero */
439: tmp0 = x[*idx++];
440: sum1 += *v1++ * tmp0;
441: }
442: y[row++]=sum1;
443: break;
444: case 2:
445: sum1 = 0;
446: sum2 = 0;
447: v2 = v1 + n;
448:
449: for (n = 0; n< sz-1; n+=2) {
450: i1 = idx[0];
451: i2 = idx[1];
452: idx += 2;
453: tmp0 = x[i1];
454: tmp1 = x[i2];
455: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
456: sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
457: }
458: if (n == sz-1){
459: tmp0 = x[*idx++];
460: sum1 += *v1++ * tmp0;
461: sum2 += *v2++ * tmp0;
462: }
463: y[row++]=sum1;
464: y[row++]=sum2;
465: v1 =v2; /* Since the next block to be processed starts there*/
466: idx +=sz;
467: break;
468: case 3:
469: sum1 = 0;
470: sum2 = 0;
471: sum3 = 0;
472: v2 = v1 + n;
473: v3 = v2 + n;
474:
475: for (n = 0; n< sz-1; n+=2) {
476: i1 = idx[0];
477: i2 = idx[1];
478: idx += 2;
479: tmp0 = x[i1];
480: tmp1 = x[i2];
481: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
482: sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
483: sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
484: }
485: if (n == sz-1){
486: tmp0 = x[*idx++];
487: sum1 += *v1++ * tmp0;
488: sum2 += *v2++ * tmp0;
489: sum3 += *v3++ * tmp0;
490: }
491: y[row++]=sum1;
492: y[row++]=sum2;
493: y[row++]=sum3;
494: v1 =v3; /* Since the next block to be processed starts there*/
495: idx +=2*sz;
496: break;
497: case 4:
498: sum1 = 0;
499: sum2 = 0;
500: sum3 = 0;
501: sum4 = 0;
502: v2 = v1 + n;
503: v3 = v2 + n;
504: v4 = v3 + n;
505:
506: for (n = 0; n< sz-1; n+=2) {
507: i1 = idx[0];
508: i2 = idx[1];
509: idx += 2;
510: tmp0 = x[i1];
511: tmp1 = x[i2];
512: sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
513: sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
514: sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
515: sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
516: }
517: if (n == sz-1){
518: tmp0 = x[*idx++];
519: sum1 += *v1++ * tmp0;
520: sum2 += *v2++ * tmp0;
521: sum3 += *v3++ * tmp0;
522: sum4 += *v4++ * tmp0;
523: }
524: y[row++]=sum1;
525: y[row++]=sum2;
526: y[row++]=sum3;
527: y[row++]=sum4;
528: v1 =v4; /* Since the next block to be processed starts there*/
529: idx +=3*sz;
530: break;
531: case 5:
532: sum1 = 0;
533: sum2 = 0;
534: sum3 = 0;
535: sum4 = 0;
536: sum5 = 0;
537: v2 = v1 + n;
538: v3 = v2 + n;
539: v4 = v3 + n;
540: v5 = v4 + n;
541:
542: for (n = 0; n<sz-1; n+=2) {
543: i1 = idx[0];
544: i2 = idx[1];
545: idx += 2;
546: tmp0 = x[i1];
547: tmp1 = x[i2];
548: sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
549: sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
550: sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
551: sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
552: sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
553: }
554: if (n == sz-1){
555: tmp0 = x[*idx++];
556: sum1 += *v1++ * tmp0;
557: sum2 += *v2++ * tmp0;
558: sum3 += *v3++ * tmp0;
559: sum4 += *v4++ * tmp0;
560: sum5 += *v5++ * tmp0;
561: }
562: y[row++]=sum1;
563: y[row++]=sum2;
564: y[row++]=sum3;
565: y[row++]=sum4;
566: y[row++]=sum5;
567: v1 =v5; /* Since the next block to be processed starts there */
568: idx +=4*sz;
569: break;
570: default :
571: SETERRQ(PETSC_ERR_COR,"Node size not yet supported");
572: }
573: }
574: VecRestoreArray(xx,&x);
575: VecRestoreArray(yy,&y);
576: PetscLogFlops(2*a->nz - A->rmap.n);
577: return(0);
578: }
579: /* ----------------------------------------------------------- */
580: /* Almost same code as the MatMult_Inode() */
583: static PetscErrorCode MatMultAdd_Inode(Mat A,Vec xx,Vec zz,Vec yy)
584: {
585: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
586: PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
587: PetscScalar *v1,*v2,*v3,*v4,*v5,*x,*y,*z,*zt;
589: PetscInt *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz;
590:
592: if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
593: node_max = a->inode.node_count;
594: ns = a->inode.size; /* Node Size array */
595: VecGetArray(xx,&x);
596: VecGetArray(yy,&y);
597: if (zz != yy) {
598: VecGetArray(zz,&z);
599: } else {
600: z = y;
601: }
602: zt = z;
604: idx = a->j;
605: v1 = a->a;
606: ii = a->i;
608: for (i = 0,row = 0; i< node_max; ++i){
609: nsz = ns[i];
610: n = ii[1] - ii[0];
611: ii += nsz;
612: sz = n; /* No of non zeros in this row */
613: /* Switch on the size of Node */
614: switch (nsz){ /* Each loop in 'case' is unrolled */
615: case 1 :
616: sum1 = *zt++;
617:
618: for(n = 0; n< sz-1; n+=2) {
619: i1 = idx[0]; /* The instructions are ordered to */
620: i2 = idx[1]; /* make the compiler's job easy */
621: idx += 2;
622: tmp0 = x[i1];
623: tmp1 = x[i2];
624: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
625: }
626:
627: if(n == sz-1){ /* Take care of the last nonzero */
628: tmp0 = x[*idx++];
629: sum1 += *v1++ * tmp0;
630: }
631: y[row++]=sum1;
632: break;
633: case 2:
634: sum1 = *zt++;
635: sum2 = *zt++;
636: v2 = v1 + n;
637:
638: for(n = 0; n< sz-1; n+=2) {
639: i1 = idx[0];
640: i2 = idx[1];
641: idx += 2;
642: tmp0 = x[i1];
643: tmp1 = x[i2];
644: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
645: sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
646: }
647: if(n == sz-1){
648: tmp0 = x[*idx++];
649: sum1 += *v1++ * tmp0;
650: sum2 += *v2++ * tmp0;
651: }
652: y[row++]=sum1;
653: y[row++]=sum2;
654: v1 =v2; /* Since the next block to be processed starts there*/
655: idx +=sz;
656: break;
657: case 3:
658: sum1 = *zt++;
659: sum2 = *zt++;
660: sum3 = *zt++;
661: v2 = v1 + n;
662: v3 = v2 + n;
663:
664: for (n = 0; n< sz-1; n+=2) {
665: i1 = idx[0];
666: i2 = idx[1];
667: idx += 2;
668: tmp0 = x[i1];
669: tmp1 = x[i2];
670: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
671: sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
672: sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
673: }
674: if (n == sz-1){
675: tmp0 = x[*idx++];
676: sum1 += *v1++ * tmp0;
677: sum2 += *v2++ * tmp0;
678: sum3 += *v3++ * tmp0;
679: }
680: y[row++]=sum1;
681: y[row++]=sum2;
682: y[row++]=sum3;
683: v1 =v3; /* Since the next block to be processed starts there*/
684: idx +=2*sz;
685: break;
686: case 4:
687: sum1 = *zt++;
688: sum2 = *zt++;
689: sum3 = *zt++;
690: sum4 = *zt++;
691: v2 = v1 + n;
692: v3 = v2 + n;
693: v4 = v3 + n;
694:
695: for (n = 0; n< sz-1; n+=2) {
696: i1 = idx[0];
697: i2 = idx[1];
698: idx += 2;
699: tmp0 = x[i1];
700: tmp1 = x[i2];
701: sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
702: sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
703: sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
704: sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
705: }
706: if (n == sz-1){
707: tmp0 = x[*idx++];
708: sum1 += *v1++ * tmp0;
709: sum2 += *v2++ * tmp0;
710: sum3 += *v3++ * tmp0;
711: sum4 += *v4++ * tmp0;
712: }
713: y[row++]=sum1;
714: y[row++]=sum2;
715: y[row++]=sum3;
716: y[row++]=sum4;
717: v1 =v4; /* Since the next block to be processed starts there*/
718: idx +=3*sz;
719: break;
720: case 5:
721: sum1 = *zt++;
722: sum2 = *zt++;
723: sum3 = *zt++;
724: sum4 = *zt++;
725: sum5 = *zt++;
726: v2 = v1 + n;
727: v3 = v2 + n;
728: v4 = v3 + n;
729: v5 = v4 + n;
730:
731: for (n = 0; n<sz-1; n+=2) {
732: i1 = idx[0];
733: i2 = idx[1];
734: idx += 2;
735: tmp0 = x[i1];
736: tmp1 = x[i2];
737: sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
738: sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
739: sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
740: sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
741: sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
742: }
743: if(n == sz-1){
744: tmp0 = x[*idx++];
745: sum1 += *v1++ * tmp0;
746: sum2 += *v2++ * tmp0;
747: sum3 += *v3++ * tmp0;
748: sum4 += *v4++ * tmp0;
749: sum5 += *v5++ * tmp0;
750: }
751: y[row++]=sum1;
752: y[row++]=sum2;
753: y[row++]=sum3;
754: y[row++]=sum4;
755: y[row++]=sum5;
756: v1 =v5; /* Since the next block to be processed starts there */
757: idx +=4*sz;
758: break;
759: default :
760: SETERRQ(PETSC_ERR_COR,"Node size not yet supported");
761: }
762: }
763: VecRestoreArray(xx,&x);
764: VecRestoreArray(yy,&y);
765: if (zz != yy) {
766: VecRestoreArray(zz,&z);
767: }
768: PetscLogFlops(2*a->nz);
769: return(0);
770: }
772: /* ----------------------------------------------------------- */
775: PetscErrorCode MatSolve_Inode(Mat A,Vec bb,Vec xx)
776: {
777: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
778: IS iscol = a->col,isrow = a->row;
780: PetscInt *r,*c,i,j,n = A->rmap.n,*ai = a->i,nz,*a_j = a->j;
781: PetscInt node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1,*rout,*cout;
782: PetscScalar *x,*b,*a_a = a->a,*tmp,*tmps,*aa,tmp0,tmp1;
783: PetscScalar sum1,sum2,sum3,sum4,sum5,*v1,*v2,*v3,*v4,*v5;
786: if (A->factor!=FACTOR_LU) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unfactored matrix");
787: if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
788: node_max = a->inode.node_count;
789: ns = a->inode.size; /* Node Size array */
791: VecGetArray(bb,&b);
792: VecGetArray(xx,&x);
793: tmp = a->solve_work;
794:
795: ISGetIndices(isrow,&rout); r = rout;
796: ISGetIndices(iscol,&cout); c = cout + (n-1);
797:
798: /* forward solve the lower triangular */
799: tmps = tmp ;
800: aa = a_a ;
801: aj = a_j ;
802: ad = a->diag;
804: for (i = 0,row = 0; i< node_max; ++i){
805: nsz = ns[i];
806: aii = ai[row];
807: v1 = aa + aii;
808: vi = aj + aii;
809: nz = ad[row]- aii;
810:
811: switch (nsz){ /* Each loop in 'case' is unrolled */
812: case 1 :
813: sum1 = b[*r++];
814: /* while (nz--) sum1 -= *v1++ *tmps[*vi++];*/
815: for(j=0; j<nz-1; j+=2){
816: i0 = vi[0];
817: i1 = vi[1];
818: vi +=2;
819: tmp0 = tmps[i0];
820: tmp1 = tmps[i1];
821: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
822: }
823: if(j == nz-1){
824: tmp0 = tmps[*vi++];
825: sum1 -= *v1++ *tmp0;
826: }
827: tmp[row ++]=sum1;
828: break;
829: case 2:
830: sum1 = b[*r++];
831: sum2 = b[*r++];
832: v2 = aa + ai[row+1];
834: for(j=0; j<nz-1; j+=2){
835: i0 = vi[0];
836: i1 = vi[1];
837: vi +=2;
838: tmp0 = tmps[i0];
839: tmp1 = tmps[i1];
840: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
841: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
842: }
843: if(j == nz-1){
844: tmp0 = tmps[*vi++];
845: sum1 -= *v1++ *tmp0;
846: sum2 -= *v2++ *tmp0;
847: }
848: sum2 -= *v2++ * sum1;
849: tmp[row ++]=sum1;
850: tmp[row ++]=sum2;
851: break;
852: case 3:
853: sum1 = b[*r++];
854: sum2 = b[*r++];
855: sum3 = b[*r++];
856: v2 = aa + ai[row+1];
857: v3 = aa + ai[row+2];
858:
859: for (j=0; j<nz-1; j+=2){
860: i0 = vi[0];
861: i1 = vi[1];
862: vi +=2;
863: tmp0 = tmps[i0];
864: tmp1 = tmps[i1];
865: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
866: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
867: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
868: }
869: if (j == nz-1){
870: tmp0 = tmps[*vi++];
871: sum1 -= *v1++ *tmp0;
872: sum2 -= *v2++ *tmp0;
873: sum3 -= *v3++ *tmp0;
874: }
875: sum2 -= *v2++ * sum1;
876: sum3 -= *v3++ * sum1;
877: sum3 -= *v3++ * sum2;
878: tmp[row ++]=sum1;
879: tmp[row ++]=sum2;
880: tmp[row ++]=sum3;
881: break;
882:
883: case 4:
884: sum1 = b[*r++];
885: sum2 = b[*r++];
886: sum3 = b[*r++];
887: sum4 = b[*r++];
888: v2 = aa + ai[row+1];
889: v3 = aa + ai[row+2];
890: v4 = aa + ai[row+3];
891:
892: for (j=0; j<nz-1; j+=2){
893: i0 = vi[0];
894: i1 = vi[1];
895: vi +=2;
896: tmp0 = tmps[i0];
897: tmp1 = tmps[i1];
898: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
899: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
900: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
901: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
902: }
903: if (j == nz-1){
904: tmp0 = tmps[*vi++];
905: sum1 -= *v1++ *tmp0;
906: sum2 -= *v2++ *tmp0;
907: sum3 -= *v3++ *tmp0;
908: sum4 -= *v4++ *tmp0;
909: }
910: sum2 -= *v2++ * sum1;
911: sum3 -= *v3++ * sum1;
912: sum4 -= *v4++ * sum1;
913: sum3 -= *v3++ * sum2;
914: sum4 -= *v4++ * sum2;
915: sum4 -= *v4++ * sum3;
916:
917: tmp[row ++]=sum1;
918: tmp[row ++]=sum2;
919: tmp[row ++]=sum3;
920: tmp[row ++]=sum4;
921: break;
922: case 5:
923: sum1 = b[*r++];
924: sum2 = b[*r++];
925: sum3 = b[*r++];
926: sum4 = b[*r++];
927: sum5 = b[*r++];
928: v2 = aa + ai[row+1];
929: v3 = aa + ai[row+2];
930: v4 = aa + ai[row+3];
931: v5 = aa + ai[row+4];
932:
933: for (j=0; j<nz-1; j+=2){
934: i0 = vi[0];
935: i1 = vi[1];
936: vi +=2;
937: tmp0 = tmps[i0];
938: tmp1 = tmps[i1];
939: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
940: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
941: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
942: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
943: sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
944: }
945: if (j == nz-1){
946: tmp0 = tmps[*vi++];
947: sum1 -= *v1++ *tmp0;
948: sum2 -= *v2++ *tmp0;
949: sum3 -= *v3++ *tmp0;
950: sum4 -= *v4++ *tmp0;
951: sum5 -= *v5++ *tmp0;
952: }
954: sum2 -= *v2++ * sum1;
955: sum3 -= *v3++ * sum1;
956: sum4 -= *v4++ * sum1;
957: sum5 -= *v5++ * sum1;
958: sum3 -= *v3++ * sum2;
959: sum4 -= *v4++ * sum2;
960: sum5 -= *v5++ * sum2;
961: sum4 -= *v4++ * sum3;
962: sum5 -= *v5++ * sum3;
963: sum5 -= *v5++ * sum4;
964:
965: tmp[row ++]=sum1;
966: tmp[row ++]=sum2;
967: tmp[row ++]=sum3;
968: tmp[row ++]=sum4;
969: tmp[row ++]=sum5;
970: break;
971: default:
972: SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
973: }
974: }
975: /* backward solve the upper triangular */
976: for (i=node_max -1 ,row = n-1 ; i>=0; i--){
977: nsz = ns[i];
978: aii = ai[row+1] -1;
979: v1 = aa + aii;
980: vi = aj + aii;
981: nz = aii- ad[row];
982: switch (nsz){ /* Each loop in 'case' is unrolled */
983: case 1 :
984: sum1 = tmp[row];
986: for(j=nz ; j>1; j-=2){
987: vi -=2;
988: i0 = vi[2];
989: i1 = vi[1];
990: tmp0 = tmps[i0];
991: tmp1 = tmps[i1];
992: v1 -= 2;
993: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
994: }
995: if (j==1){
996: tmp0 = tmps[*vi--];
997: sum1 -= *v1-- * tmp0;
998: }
999: x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1000: break;
1001: case 2 :
1002: sum1 = tmp[row];
1003: sum2 = tmp[row -1];
1004: v2 = aa + ai[row]-1;
1005: for (j=nz ; j>1; j-=2){
1006: vi -=2;
1007: i0 = vi[2];
1008: i1 = vi[1];
1009: tmp0 = tmps[i0];
1010: tmp1 = tmps[i1];
1011: v1 -= 2;
1012: v2 -= 2;
1013: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1014: sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1015: }
1016: if (j==1){
1017: tmp0 = tmps[*vi--];
1018: sum1 -= *v1-- * tmp0;
1019: sum2 -= *v2-- * tmp0;
1020: }
1021:
1022: tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1023: sum2 -= *v2-- * tmp0;
1024: x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1025: break;
1026: case 3 :
1027: sum1 = tmp[row];
1028: sum2 = tmp[row -1];
1029: sum3 = tmp[row -2];
1030: v2 = aa + ai[row]-1;
1031: v3 = aa + ai[row -1]-1;
1032: for (j=nz ; j>1; j-=2){
1033: vi -=2;
1034: i0 = vi[2];
1035: i1 = vi[1];
1036: tmp0 = tmps[i0];
1037: tmp1 = tmps[i1];
1038: v1 -= 2;
1039: v2 -= 2;
1040: v3 -= 2;
1041: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1042: sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1043: sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1044: }
1045: if (j==1){
1046: tmp0 = tmps[*vi--];
1047: sum1 -= *v1-- * tmp0;
1048: sum2 -= *v2-- * tmp0;
1049: sum3 -= *v3-- * tmp0;
1050: }
1051: tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1052: sum2 -= *v2-- * tmp0;
1053: sum3 -= *v3-- * tmp0;
1054: tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1055: sum3 -= *v3-- * tmp0;
1056: x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1057:
1058: break;
1059: case 4 :
1060: sum1 = tmp[row];
1061: sum2 = tmp[row -1];
1062: sum3 = tmp[row -2];
1063: sum4 = tmp[row -3];
1064: v2 = aa + ai[row]-1;
1065: v3 = aa + ai[row -1]-1;
1066: v4 = aa + ai[row -2]-1;
1068: for (j=nz ; j>1; j-=2){
1069: vi -=2;
1070: i0 = vi[2];
1071: i1 = vi[1];
1072: tmp0 = tmps[i0];
1073: tmp1 = tmps[i1];
1074: v1 -= 2;
1075: v2 -= 2;
1076: v3 -= 2;
1077: v4 -= 2;
1078: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1079: sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1080: sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1081: sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1082: }
1083: if (j==1){
1084: tmp0 = tmps[*vi--];
1085: sum1 -= *v1-- * tmp0;
1086: sum2 -= *v2-- * tmp0;
1087: sum3 -= *v3-- * tmp0;
1088: sum4 -= *v4-- * tmp0;
1089: }
1091: tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1092: sum2 -= *v2-- * tmp0;
1093: sum3 -= *v3-- * tmp0;
1094: sum4 -= *v4-- * tmp0;
1095: tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1096: sum3 -= *v3-- * tmp0;
1097: sum4 -= *v4-- * tmp0;
1098: tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1099: sum4 -= *v4-- * tmp0;
1100: x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1101: break;
1102: case 5 :
1103: sum1 = tmp[row];
1104: sum2 = tmp[row -1];
1105: sum3 = tmp[row -2];
1106: sum4 = tmp[row -3];
1107: sum5 = tmp[row -4];
1108: v2 = aa + ai[row]-1;
1109: v3 = aa + ai[row -1]-1;
1110: v4 = aa + ai[row -2]-1;
1111: v5 = aa + ai[row -3]-1;
1112: for (j=nz ; j>1; j-=2){
1113: vi -= 2;
1114: i0 = vi[2];
1115: i1 = vi[1];
1116: tmp0 = tmps[i0];
1117: tmp1 = tmps[i1];
1118: v1 -= 2;
1119: v2 -= 2;
1120: v3 -= 2;
1121: v4 -= 2;
1122: v5 -= 2;
1123: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1124: sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1125: sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1126: sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1127: sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
1128: }
1129: if (j==1){
1130: tmp0 = tmps[*vi--];
1131: sum1 -= *v1-- * tmp0;
1132: sum2 -= *v2-- * tmp0;
1133: sum3 -= *v3-- * tmp0;
1134: sum4 -= *v4-- * tmp0;
1135: sum5 -= *v5-- * tmp0;
1136: }
1138: tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1139: sum2 -= *v2-- * tmp0;
1140: sum3 -= *v3-- * tmp0;
1141: sum4 -= *v4-- * tmp0;
1142: sum5 -= *v5-- * tmp0;
1143: tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1144: sum3 -= *v3-- * tmp0;
1145: sum4 -= *v4-- * tmp0;
1146: sum5 -= *v5-- * tmp0;
1147: tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1148: sum4 -= *v4-- * tmp0;
1149: sum5 -= *v5-- * tmp0;
1150: tmp0 = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1151: sum5 -= *v5-- * tmp0;
1152: x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--;
1153: break;
1154: default:
1155: SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
1156: }
1157: }
1158: ISRestoreIndices(isrow,&rout);
1159: ISRestoreIndices(iscol,&cout);
1160: VecRestoreArray(bb,&b);
1161: VecRestoreArray(xx,&x);
1162: PetscLogFlops(2*a->nz - A->cmap.n);
1163: return(0);
1164: }
1168: PetscErrorCode MatLUFactorNumeric_Inode(Mat A,MatFactorInfo *info,Mat *B)
1169: {
1170: Mat C = *B;
1171: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data;
1172: IS iscol = b->col,isrow = b->row,isicol = b->icol;
1174: PetscInt *r,*ic,*c,n = A->rmap.n,*bi = b->i;
1175: PetscInt *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow;
1176: PetscInt *ics,i,j,idx,*ai = a->i,*aj = a->j,*bd = b->diag,node_max,nodesz;
1177: PetscInt *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj;
1178: PetscScalar *rtmp1,*rtmp2,*rtmp3,*v1,*v2,*v3,*pc1,*pc2,*pc3,mul1,mul2,mul3;
1179: PetscScalar tmp,*ba = b->a,*aa = a->a,*pv;
1180: PetscReal rs=0.0;
1181: LUShift_Ctx sctx;
1182: PetscInt newshift;
1185: sctx.shift_top = 0;
1186: sctx.nshift_max = 0;
1187: sctx.shift_lo = 0;
1188: sctx.shift_hi = 0;
1190: /* if both shift schemes are chosen by user, only use info->shiftpd */
1191: if (info->shiftpd && info->shiftnz) info->shiftnz = 0.0;
1192: if (info->shiftpd) { /* set sctx.shift_top=max{rs} */
1193: sctx.shift_top = 0;
1194: for (i=0; i<n; i++) {
1195: /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1196: rs = 0.0;
1197: ajtmp = aj + ai[i];
1198: rtmp1 = aa + ai[i];
1199: nz = ai[i+1] - ai[i];
1200: for (j=0; j<nz; j++){
1201: if (*ajtmp != i){
1202: rs += PetscAbsScalar(*rtmp1++);
1203: } else {
1204: rs -= PetscRealPart(*rtmp1++);
1205: }
1206: ajtmp++;
1207: }
1208: if (rs>sctx.shift_top) sctx.shift_top = rs;
1209: }
1210: if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12;
1211: sctx.shift_top *= 1.1;
1212: sctx.nshift_max = 5;
1213: sctx.shift_lo = 0.;
1214: sctx.shift_hi = 1.;
1215: }
1216: sctx.shift_amount = 0;
1217: sctx.nshift = 0;
1219: ISGetIndices(isrow,&r);
1220: ISGetIndices(iscol,&c);
1221: ISGetIndices(isicol,&ic);
1222: PetscMalloc((3*n+1)*sizeof(PetscScalar),&rtmp1);
1223: PetscMemzero(rtmp1,(3*n+1)*sizeof(PetscScalar));
1224: ics = ic ;
1225: rtmp2 = rtmp1 + n;
1226: rtmp3 = rtmp2 + n;
1227:
1228: node_max = a->inode.node_count;
1229: ns = a->inode.size ;
1230: if (!ns){
1231: SETERRQ(PETSC_ERR_PLIB,"Matrix without inode information");
1232: }
1234: /* If max inode size > 3, split it into two inodes.*/
1235: /* also map the inode sizes according to the ordering */
1236: PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);
1237: for (i=0,j=0; i<node_max; ++i,++j){
1238: if (ns[i]>3) {
1239: tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5 */
1240: ++j;
1241: tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1242: } else {
1243: tmp_vec1[j] = ns[i];
1244: }
1245: }
1246: /* Use the correct node_max */
1247: node_max = j;
1249: /* Now reorder the inode info based on mat re-ordering info */
1250: /* First create a row -> inode_size_array_index map */
1251: PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);
1252: PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);
1253: for (i=0,row=0; i<node_max; i++) {
1254: nodesz = tmp_vec1[i];
1255: for (j=0; j<nodesz; j++,row++) {
1256: nsmap[row] = i;
1257: }
1258: }
1259: /* Using nsmap, create a reordered ns structure */
1260: for (i=0,j=0; i< node_max; i++) {
1261: nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */
1262: tmp_vec2[i] = nodesz;
1263: j += nodesz;
1264: }
1265: PetscFree(nsmap);
1266: PetscFree(tmp_vec1);
1267: /* Now use the correct ns */
1268: ns = tmp_vec2;
1270: do {
1271: sctx.lushift = PETSC_FALSE;
1272: /* Now loop over each block-row, and do the factorization */
1273: for (i=0,row=0; i<node_max; i++) {
1274: nodesz = ns[i];
1275: nz = bi[row+1] - bi[row];
1276: bjtmp = bj + bi[row];
1278: switch (nodesz){
1279: case 1:
1280: for (j=0; j<nz; j++){
1281: idx = bjtmp[j];
1282: rtmp1[idx] = 0.0;
1283: }
1284:
1285: /* load in initial (unfactored row) */
1286: idx = r[row];
1287: nz_tmp = ai[idx+1] - ai[idx];
1288: ajtmp = aj + ai[idx];
1289: v1 = aa + ai[idx];
1291: for (j=0; j<nz_tmp; j++) {
1292: idx = ics[ajtmp[j]];
1293: rtmp1[idx] = v1[j];
1294: }
1295: rtmp1[ics[r[row]]] += sctx.shift_amount;
1297: prow = *bjtmp++ ;
1298: while (prow < row) {
1299: pc1 = rtmp1 + prow;
1300: if (*pc1 != 0.0){
1301: pv = ba + bd[prow];
1302: pj = nbj + bd[prow];
1303: mul1 = *pc1 * *pv++;
1304: *pc1 = mul1;
1305: nz_tmp = bi[prow+1] - bd[prow] - 1;
1306: PetscLogFlops(2*nz_tmp);
1307: for (j=0; j<nz_tmp; j++) {
1308: tmp = pv[j];
1309: idx = pj[j];
1310: rtmp1[idx] -= mul1 * tmp;
1311: }
1312: }
1313: prow = *bjtmp++ ;
1314: }
1315: pj = bj + bi[row];
1316: pc1 = ba + bi[row];
1318: sctx.pv = rtmp1[row];
1319: rtmp1[row] = 1.0/rtmp1[row]; /* invert diag */
1320: rs = 0.0;
1321: for (j=0; j<nz; j++) {
1322: idx = pj[j];
1323: pc1[j] = rtmp1[idx]; /* rtmp1 -> ba */
1324: if (idx != row) rs += PetscAbsScalar(pc1[j]);
1325: }
1326: sctx.rs = rs;
1327: MatLUCheckShift_inline(info,sctx,row,newshift);
1328: if (newshift == 1) goto endofwhile;
1329: break;
1330:
1331: case 2:
1332: for (j=0; j<nz; j++) {
1333: idx = bjtmp[j];
1334: rtmp1[idx] = 0.0;
1335: rtmp2[idx] = 0.0;
1336: }
1337:
1338: /* load in initial (unfactored row) */
1339: idx = r[row];
1340: nz_tmp = ai[idx+1] - ai[idx];
1341: ajtmp = aj + ai[idx];
1342: v1 = aa + ai[idx];
1343: v2 = aa + ai[idx+1];
1344: for (j=0; j<nz_tmp; j++) {
1345: idx = ics[ajtmp[j]];
1346: rtmp1[idx] = v1[j];
1347: rtmp2[idx] = v2[j];
1348: }
1349: rtmp1[ics[r[row]]] += sctx.shift_amount;
1350: rtmp2[ics[r[row+1]]] += sctx.shift_amount;
1352: prow = *bjtmp++ ;
1353: while (prow < row) {
1354: pc1 = rtmp1 + prow;
1355: pc2 = rtmp2 + prow;
1356: if (*pc1 != 0.0 || *pc2 != 0.0){
1357: pv = ba + bd[prow];
1358: pj = nbj + bd[prow];
1359: mul1 = *pc1 * *pv;
1360: mul2 = *pc2 * *pv;
1361: ++pv;
1362: *pc1 = mul1;
1363: *pc2 = mul2;
1364:
1365: nz_tmp = bi[prow+1] - bd[prow] - 1;
1366: for (j=0; j<nz_tmp; j++) {
1367: tmp = pv[j];
1368: idx = pj[j];
1369: rtmp1[idx] -= mul1 * tmp;
1370: rtmp2[idx] -= mul2 * tmp;
1371: }
1372: PetscLogFlops(4*nz_tmp);
1373: }
1374: prow = *bjtmp++ ;
1375: }
1377: /* Now take care of diagonal 2x2 block. Note: prow = row here */
1378: pc1 = rtmp1 + prow;
1379: pc2 = rtmp2 + prow;
1381: sctx.pv = *pc1;
1382: pj = bj + bi[prow];
1383: rs = 0.0;
1384: for (j=0; j<nz; j++){
1385: idx = pj[j];
1386: if (idx != prow) rs += PetscAbsScalar(rtmp1[idx]);
1387: }
1388: sctx.rs = rs;
1389: MatLUCheckShift_inline(info,sctx,row,newshift);
1390: if (newshift == 1) goto endofwhile;
1392: if (*pc2 != 0.0){
1393: pj = nbj + bd[prow];
1394: mul2 = (*pc2)/(*pc1); /* since diag is not yet inverted.*/
1395: *pc2 = mul2;
1396: nz_tmp = bi[prow+1] - bd[prow] - 1;
1397: for (j=0; j<nz_tmp; j++) {
1398: idx = pj[j] ;
1399: tmp = rtmp1[idx];
1400: rtmp2[idx] -= mul2 * tmp;
1401: }
1402: PetscLogFlops(2*nz_tmp);
1403: }
1404:
1405: pj = bj + bi[row];
1406: pc1 = ba + bi[row];
1407: pc2 = ba + bi[row+1];
1409: sctx.pv = rtmp2[row+1];
1410: rs = 0.0;
1411: rtmp1[row] = 1.0/rtmp1[row];
1412: rtmp2[row+1] = 1.0/rtmp2[row+1];
1413: /* copy row entries from dense representation to sparse */
1414: for (j=0; j<nz; j++) {
1415: idx = pj[j];
1416: pc1[j] = rtmp1[idx];
1417: pc2[j] = rtmp2[idx];
1418: if (idx != row+1) rs += PetscAbsScalar(pc2[j]);
1419: }
1420: sctx.rs = rs;
1421: MatLUCheckShift_inline(info,sctx,row+1,newshift);
1422: if (newshift == 1) goto endofwhile;
1423: break;
1425: case 3:
1426: for (j=0; j<nz; j++) {
1427: idx = bjtmp[j];
1428: rtmp1[idx] = 0.0;
1429: rtmp2[idx] = 0.0;
1430: rtmp3[idx] = 0.0;
1431: }
1432: /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
1433: idx = r[row];
1434: nz_tmp = ai[idx+1] - ai[idx];
1435: ajtmp = aj + ai[idx];
1436: v1 = aa + ai[idx];
1437: v2 = aa + ai[idx+1];
1438: v3 = aa + ai[idx+2];
1439: for (j=0; j<nz_tmp; j++) {
1440: idx = ics[ajtmp[j]];
1441: rtmp1[idx] = v1[j];
1442: rtmp2[idx] = v2[j];
1443: rtmp3[idx] = v3[j];
1444: }
1445: rtmp1[ics[r[row]]] += sctx.shift_amount;
1446: rtmp2[ics[r[row+1]]] += sctx.shift_amount;
1447: rtmp3[ics[r[row+2]]] += sctx.shift_amount;
1449: /* loop over all pivot row blocks above this row block */
1450: prow = *bjtmp++ ;
1451: while (prow < row) {
1452: pc1 = rtmp1 + prow;
1453: pc2 = rtmp2 + prow;
1454: pc3 = rtmp3 + prow;
1455: if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0){
1456: pv = ba + bd[prow];
1457: pj = nbj + bd[prow];
1458: mul1 = *pc1 * *pv;
1459: mul2 = *pc2 * *pv;
1460: mul3 = *pc3 * *pv;
1461: ++pv;
1462: *pc1 = mul1;
1463: *pc2 = mul2;
1464: *pc3 = mul3;
1465:
1466: nz_tmp = bi[prow+1] - bd[prow] - 1;
1467: /* update this row based on pivot row */
1468: for (j=0; j<nz_tmp; j++) {
1469: tmp = pv[j];
1470: idx = pj[j];
1471: rtmp1[idx] -= mul1 * tmp;
1472: rtmp2[idx] -= mul2 * tmp;
1473: rtmp3[idx] -= mul3 * tmp;
1474: }
1475: PetscLogFlops(6*nz_tmp);
1476: }
1477: prow = *bjtmp++ ;
1478: }
1480: /* Now take care of diagonal 3x3 block in this set of rows */
1481: /* note: prow = row here */
1482: pc1 = rtmp1 + prow;
1483: pc2 = rtmp2 + prow;
1484: pc3 = rtmp3 + prow;
1486: sctx.pv = *pc1;
1487: pj = bj + bi[prow];
1488: rs = 0.0;
1489: for (j=0; j<nz; j++){
1490: idx = pj[j];
1491: if (idx != row) rs += PetscAbsScalar(rtmp1[idx]);
1492: }
1493: sctx.rs = rs;
1494: MatLUCheckShift_inline(info,sctx,row,newshift);
1495: if (newshift == 1) goto endofwhile;
1497: if (*pc2 != 0.0 || *pc3 != 0.0){
1498: mul2 = (*pc2)/(*pc1);
1499: mul3 = (*pc3)/(*pc1);
1500: *pc2 = mul2;
1501: *pc3 = mul3;
1502: nz_tmp = bi[prow+1] - bd[prow] - 1;
1503: pj = nbj + bd[prow];
1504: for (j=0; j<nz_tmp; j++) {
1505: idx = pj[j] ;
1506: tmp = rtmp1[idx];
1507: rtmp2[idx] -= mul2 * tmp;
1508: rtmp3[idx] -= mul3 * tmp;
1509: }
1510: PetscLogFlops(4*nz_tmp);
1511: }
1512: ++prow;
1514: pc2 = rtmp2 + prow;
1515: pc3 = rtmp3 + prow;
1516: sctx.pv = *pc2;
1517: pj = bj + bi[prow];
1518: rs = 0.0;
1519: for (j=0; j<nz; j++){
1520: idx = pj[j];
1521: if (idx != prow) rs += PetscAbsScalar(rtmp2[idx]);
1522: }
1523: sctx.rs = rs;
1524: MatLUCheckShift_inline(info,sctx,row+1,newshift);
1525: if (newshift == 1) goto endofwhile;
1527: if (*pc3 != 0.0){
1528: mul3 = (*pc3)/(*pc2);
1529: *pc3 = mul3;
1530: pj = nbj + bd[prow];
1531: nz_tmp = bi[prow+1] - bd[prow] - 1;
1532: for (j=0; j<nz_tmp; j++) {
1533: idx = pj[j] ;
1534: tmp = rtmp2[idx];
1535: rtmp3[idx] -= mul3 * tmp;
1536: }
1537: PetscLogFlops(4*nz_tmp);
1538: }
1540: pj = bj + bi[row];
1541: pc1 = ba + bi[row];
1542: pc2 = ba + bi[row+1];
1543: pc3 = ba + bi[row+2];
1545: sctx.pv = rtmp3[row+2];
1546: rs = 0.0;
1547: rtmp1[row] = 1.0/rtmp1[row];
1548: rtmp2[row+1] = 1.0/rtmp2[row+1];
1549: rtmp3[row+2] = 1.0/rtmp3[row+2];
1550: /* copy row entries from dense representation to sparse */
1551: for (j=0; j<nz; j++) {
1552: idx = pj[j];
1553: pc1[j] = rtmp1[idx];
1554: pc2[j] = rtmp2[idx];
1555: pc3[j] = rtmp3[idx];
1556: if (idx != row+2) rs += PetscAbsScalar(pc3[j]);
1557: }
1559: sctx.rs = rs;
1560: MatLUCheckShift_inline(info,sctx,row+2,newshift);
1561: if (newshift == 1) goto endofwhile;
1562: break;
1564: default:
1565: SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
1566: }
1567: row += nodesz; /* Update the row */
1568: }
1569: endofwhile:;
1570: } while (sctx.lushift);
1571: PetscFree(rtmp1);
1572: PetscFree(tmp_vec2);
1573: ISRestoreIndices(isicol,&ic);
1574: ISRestoreIndices(isrow,&r);
1575: ISRestoreIndices(iscol,&c);
1576: C->factor = FACTOR_LU;
1577: C->assembled = PETSC_TRUE;
1578: if (sctx.nshift) {
1579: if (info->shiftnz) {
1580: PetscInfo2(0,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);
1581: } else if (info->shiftpd) {
1582: PetscInfo4(0,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,info->shift_fraction,sctx.shift_top);
1583: }
1584: }
1585: PetscLogFlops(C->cmap.n);
1586: return(0);
1587: }
1589: /*
1590: Makes a longer coloring[] array and calls the usual code with that
1591: */
1594: PetscErrorCode MatColoringPatch_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring)
1595: {
1596: Mat_SeqAIJ *a = (Mat_SeqAIJ*)mat->data;
1597: PetscErrorCode ierr;
1598: PetscInt n = mat->cmap.n,m = a->inode.node_count,j,*ns = a->inode.size,row;
1599: PetscInt *colorused,i;
1600: ISColoringValue *newcolor;
1603: PetscMalloc((n+1)*sizeof(PetscInt),&newcolor);
1604: /* loop over inodes, marking a color for each column*/
1605: row = 0;
1606: for (i=0; i<m; i++){
1607: for (j=0; j<ns[i]; j++) {
1608: newcolor[row++] = coloring[i] + j*ncolors;
1609: }
1610: }
1612: /* eliminate unneeded colors */
1613: PetscMalloc(5*ncolors*sizeof(PetscInt),&colorused);
1614: PetscMemzero(colorused,5*ncolors*sizeof(PetscInt));
1615: for (i=0; i<n; i++) {
1616: colorused[newcolor[i]] = 1;
1617: }
1619: for (i=1; i<5*ncolors; i++) {
1620: colorused[i] += colorused[i-1];
1621: }
1622: ncolors = colorused[5*ncolors-1];
1623: for (i=0; i<n; i++) {
1624: newcolor[i] = colorused[newcolor[i]]-1;
1625: }
1626: PetscFree(colorused);
1627: ISColoringCreate(mat->comm,ncolors,n,newcolor,iscoloring);
1628: PetscFree(coloring);
1629: return(0);
1630: }
1632: /*
1633: samestructure indicates that the matrix has not changed its nonzero structure so we
1634: do not need to recompute the inodes
1635: */
1638: PetscErrorCode Mat_CheckInode(Mat A,PetscTruth samestructure)
1639: {
1640: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
1642: PetscInt i,j,m,nzx,nzy,*idx,*idy,*ns,*ii,node_count,blk_size;
1643: PetscTruth flag;
1646: if (!a->inode.use) return(0);
1647: if (a->inode.checked && samestructure) return(0);
1650: m = A->rmap.n;
1651: if (a->inode.size) {ns = a->inode.size;}
1652: else {PetscMalloc((m+1)*sizeof(PetscInt),&ns);}
1654: i = 0;
1655: node_count = 0;
1656: idx = a->j;
1657: ii = a->i;
1658: while (i < m){ /* For each row */
1659: nzx = ii[i+1] - ii[i]; /* Number of nonzeros */
1660: /* Limits the number of elements in a node to 'a->inode.limit' */
1661: for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
1662: nzy = ii[j+1] - ii[j]; /* Same number of nonzeros */
1663: if (nzy != nzx) break;
1664: idy += nzx; /* Same nonzero pattern */
1665: PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);
1666: if (!flag) break;
1667: }
1668: ns[node_count++] = blk_size;
1669: idx += blk_size*nzx;
1670: i = j;
1671: }
1672: /* If not enough inodes found,, do not use inode version of the routines */
1673: if (!a->inode.size && m && node_count > 0.9*m) {
1674: PetscFree(ns);
1675: a->inode.node_count = 0;
1676: a->inode.size = PETSC_NULL;
1677: a->inode.use = PETSC_FALSE;
1678: PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);
1679: } else {
1680: A->ops->mult = MatMult_Inode;
1681: A->ops->multadd = MatMultAdd_Inode;
1682: A->ops->solve = MatSolve_Inode;
1683: A->ops->lufactornumeric = MatLUFactorNumeric_Inode;
1684: A->ops->getrowij = MatGetRowIJ_Inode;
1685: A->ops->restorerowij = MatRestoreRowIJ_Inode;
1686: A->ops->getcolumnij = MatGetColumnIJ_Inode;
1687: A->ops->restorecolumnij = MatRestoreColumnIJ_Inode;
1688: A->ops->coloringpatch = MatColoringPatch_Inode;
1689: a->inode.node_count = node_count;
1690: a->inode.size = ns;
1691: PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);
1692: }
1693: return(0);
1694: }
1696: /*
1697: This is really ugly. if inodes are used this replaces the
1698: permutations with ones that correspond to rows/cols of the matrix
1699: rather then inode blocks
1700: */
1703: PetscErrorCode MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm)
1704: {
1705: PetscErrorCode ierr,(*f)(Mat,IS*,IS*);
1708: PetscObjectQueryFunction((PetscObject)A,"MatInodeAdjustForInodes_C",(void (**)(void))&f);
1709: if (f) {
1710: (*f)(A,rperm,cperm);
1711: }
1712: return(0);
1713: }
1718: PetscErrorCode MatInodeAdjustForInodes_Inode(Mat A,IS *rperm,IS *cperm)
1719: {
1720: Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data;
1722: PetscInt m = A->rmap.n,n = A->cmap.n,i,j,*ridx,*cidx,nslim_row = a->inode.node_count;
1723: PetscInt row,col,*permr,*permc,*ns_row = a->inode.size,*tns,start_val,end_val,indx;
1724: PetscInt nslim_col,*ns_col;
1725: IS ris = *rperm,cis = *cperm;
1728: if (!a->inode.size) return(0); /* no inodes so return */
1729: if (a->inode.node_count == m) return(0); /* all inodes are of size 1 */
1731: Mat_CreateColInode(A,&nslim_col,&ns_col);
1732: PetscMalloc((((nslim_row>nslim_col)?nslim_row:nslim_col)+1)*sizeof(PetscInt),&tns);
1733: PetscMalloc((m+n+1)*sizeof(PetscInt),&permr);
1734: permc = permr + m;
1736: ISGetIndices(ris,&ridx);
1737: ISGetIndices(cis,&cidx);
1739: /* Form the inode structure for the rows of permuted matric using inv perm*/
1740: for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i];
1742: /* Construct the permutations for rows*/
1743: for (i=0,row = 0; i<nslim_row; ++i){
1744: indx = ridx[i];
1745: start_val = tns[indx];
1746: end_val = tns[indx + 1];
1747: for (j=start_val; j<end_val; ++j,++row) permr[row]= j;
1748: }
1750: /* Form the inode structure for the columns of permuted matrix using inv perm*/
1751: for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i];
1753: /* Construct permutations for columns */
1754: for (i=0,col=0; i<nslim_col; ++i){
1755: indx = cidx[i];
1756: start_val = tns[indx];
1757: end_val = tns[indx + 1];
1758: for (j = start_val; j<end_val; ++j,++col) permc[col]= j;
1759: }
1761: ISCreateGeneral(PETSC_COMM_SELF,n,permr,rperm);
1762: ISSetPermutation(*rperm);
1763: ISCreateGeneral(PETSC_COMM_SELF,n,permc,cperm);
1764: ISSetPermutation(*cperm);
1765:
1766: ISRestoreIndices(ris,&ridx);
1767: ISRestoreIndices(cis,&cidx);
1769: PetscFree(ns_col);
1770: PetscFree(permr);
1771: ISDestroy(cis);
1772: ISDestroy(ris);
1773: PetscFree(tns);
1774: return(0);
1775: }
1780: /*@C
1781: MatInodeGetInodeSizes - Returns the inode information of the Inode matrix.
1783: Collective on Mat
1785: Input Parameter:
1786: . A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ
1788: Output Parameter:
1789: + node_count - no of inodes present in the matrix.
1790: . sizes - an array of size node_count,with sizes of each inode.
1791: - limit - the max size used to generate the inodes.
1793: Level: advanced
1795: Notes: This routine returns some internal storage information
1796: of the matrix, it is intended to be used by advanced users.
1797: It should be called after the matrix is assembled.
1798: The contents of the sizes[] array should not be changed.
1799: PETSC_NULL may be passed for information not requested.
1801: .keywords: matrix, seqaij, get, inode
1803: .seealso: MatGetInfo()
1804: @*/
1805: PetscErrorCode MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
1806: {
1807: PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*);
1810: if (!A->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
1811: PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",(void (**)(void))&f);
1812: if (f) {
1813: (*f)(A,node_count,sizes,limit);
1814: }
1815: return(0);
1816: }
1821: PetscErrorCode MatInodeGetInodeSizes_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
1822: {
1823: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
1826: if (node_count) *node_count = a->inode.node_count;
1827: if (sizes) *sizes = a->inode.size;
1828: if (limit) *limit = a->inode.limit;
1829: return(0);
1830: }