Actual source code: baij2.c
1: #define PETSCMAT_DLL
3: #include src/mat/impls/baij/seq/baij.h
4: #include src/inline/spops.h
5: #include src/inline/ilu.h
6: #include petscbt.h
10: PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
11: {
12: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
14: PetscInt row,i,j,k,l,m,n,*idx,*nidx,isz,val,ival;
15: PetscInt start,end,*ai,*aj,bs,*nidx2;
16: PetscBT table;
19: m = a->mbs;
20: ai = a->i;
21: aj = a->j;
22: bs = A->rmap.bs;
24: if (ov < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
26: PetscBTCreate(m,table);
27: PetscMalloc((m+1)*sizeof(PetscInt),&nidx);
28: PetscMalloc((A->rmap.N+1)*sizeof(PetscInt),&nidx2);
30: for (i=0; i<is_max; i++) {
31: /* Initialise the two local arrays */
32: isz = 0;
33: PetscBTMemzero(m,table);
34:
35: /* Extract the indices, assume there can be duplicate entries */
36: ISGetIndices(is[i],&idx);
37: ISGetLocalSize(is[i],&n);
39: /* Enter these into the temp arrays i.e mark table[row], enter row into new index */
40: for (j=0; j<n ; ++j){
41: ival = idx[j]/bs; /* convert the indices into block indices */
42: if (ival>=m) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
43: if(!PetscBTLookupSet(table,ival)) { nidx[isz++] = ival;}
44: }
45: ISRestoreIndices(is[i],&idx);
46: ISDestroy(is[i]);
47:
48: k = 0;
49: for (j=0; j<ov; j++){ /* for each overlap*/
50: n = isz;
51: for (; k<n ; k++){ /* do only those rows in nidx[k], which are not done yet */
52: row = nidx[k];
53: start = ai[row];
54: end = ai[row+1];
55: for (l = start; l<end ; l++){
56: val = aj[l];
57: if (!PetscBTLookupSet(table,val)) {nidx[isz++] = val;}
58: }
59: }
60: }
61: /* expand the Index Set */
62: for (j=0; j<isz; j++) {
63: for (k=0; k<bs; k++)
64: nidx2[j*bs+k] = nidx[j]*bs+k;
65: }
66: ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,is+i);
67: }
68: PetscBTDestroy(table);
69: PetscFree(nidx);
70: PetscFree(nidx2);
71: return(0);
72: }
76: PetscErrorCode MatGetSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,PetscInt cs,MatReuse scall,Mat *B)
77: {
78: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*c;
80: PetscInt *smap,i,k,kstart,kend,oldcols = a->nbs,*lens;
81: PetscInt row,mat_i,*mat_j,tcol,*mat_ilen;
82: PetscInt *irow,*icol,nrows,ncols,*ssmap,bs=A->rmap.bs,bs2=a->bs2;
83: PetscInt *aj = a->j,*ai = a->i;
84: MatScalar *mat_a;
85: Mat C;
86: PetscTruth flag;
89: ISSorted(iscol,(PetscTruth*)&i);
90: if (!i) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted");
92: ISGetIndices(isrow,&irow);
93: ISGetIndices(iscol,&icol);
94: ISGetLocalSize(isrow,&nrows);
95: ISGetLocalSize(iscol,&ncols);
97: PetscMalloc((1+oldcols)*sizeof(PetscInt),&smap);
98: ssmap = smap;
99: PetscMalloc((1+nrows)*sizeof(PetscInt),&lens);
100: PetscMemzero(smap,oldcols*sizeof(PetscInt));
101: for (i=0; i<ncols; i++) smap[icol[i]] = i+1;
102: /* determine lens of each row */
103: for (i=0; i<nrows; i++) {
104: kstart = ai[irow[i]];
105: kend = kstart + a->ilen[irow[i]];
106: lens[i] = 0;
107: for (k=kstart; k<kend; k++) {
108: if (ssmap[aj[k]]) {
109: lens[i]++;
110: }
111: }
112: }
113: /* Create and fill new matrix */
114: if (scall == MAT_REUSE_MATRIX) {
115: c = (Mat_SeqBAIJ *)((*B)->data);
117: if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap.bs!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
118: PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);
119: if (!flag) {
120: SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
121: }
122: PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));
123: C = *B;
124: } else {
125: MatCreate(A->comm,&C);
126: MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);
127: MatSetType(C,A->type_name);
128: MatSeqBAIJSetPreallocation_SeqBAIJ(C,bs,0,lens);
129: }
130: c = (Mat_SeqBAIJ *)(C->data);
131: for (i=0; i<nrows; i++) {
132: row = irow[i];
133: kstart = ai[row];
134: kend = kstart + a->ilen[row];
135: mat_i = c->i[i];
136: mat_j = c->j + mat_i;
137: mat_a = c->a + mat_i*bs2;
138: mat_ilen = c->ilen + i;
139: for (k=kstart; k<kend; k++) {
140: if ((tcol=ssmap[a->j[k]])) {
141: *mat_j++ = tcol - 1;
142: PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));
143: mat_a += bs2;
144: (*mat_ilen)++;
145: }
146: }
147: }
148:
149: /* Free work space */
150: ISRestoreIndices(iscol,&icol);
151: PetscFree(smap);
152: PetscFree(lens);
153: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
154: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
155:
156: ISRestoreIndices(isrow,&irow);
157: *B = C;
158: return(0);
159: }
163: PetscErrorCode MatGetSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,PetscInt cs,MatReuse scall,Mat *B)
164: {
165: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
166: IS is1,is2;
168: PetscInt *vary,*iary,*irow,*icol,nrows,ncols,i,bs=A->rmap.bs,count;
171: ISGetIndices(isrow,&irow);
172: ISGetIndices(iscol,&icol);
173: ISGetLocalSize(isrow,&nrows);
174: ISGetLocalSize(iscol,&ncols);
175:
176: /* Verify if the indices corespond to each element in a block
177: and form the IS with compressed IS */
178: PetscMalloc(2*(a->mbs+1)*sizeof(PetscInt),&vary);
179: iary = vary + a->mbs;
180: PetscMemzero(vary,(a->mbs)*sizeof(PetscInt));
181: for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
182: count = 0;
183: for (i=0; i<a->mbs; i++) {
184: if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,"Index set does not match blocks");
185: if (vary[i]==bs) iary[count++] = i;
186: }
187: ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is1);
188:
189: PetscMemzero(vary,(a->mbs)*sizeof(PetscInt));
190: for (i=0; i<ncols; i++) vary[icol[i]/bs]++;
191: count = 0;
192: for (i=0; i<a->mbs; i++) {
193: if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_ERR_PLIB,"Internal error in PETSc");
194: if (vary[i]==bs) iary[count++] = i;
195: }
196: ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is2);
197: ISRestoreIndices(isrow,&irow);
198: ISRestoreIndices(iscol,&icol);
199: PetscFree(vary);
201: MatGetSubMatrix_SeqBAIJ_Private(A,is1,is2,cs,scall,B);
202: ISDestroy(is1);
203: ISDestroy(is2);
204: return(0);
205: }
209: PetscErrorCode MatGetSubMatrices_SeqBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
210: {
212: PetscInt i;
215: if (scall == MAT_INITIAL_MATRIX) {
216: PetscMalloc((n+1)*sizeof(Mat),B);
217: }
219: for (i=0; i<n; i++) {
220: MatGetSubMatrix_SeqBAIJ(A,irow[i],icol[i],PETSC_DECIDE,scall,&(*B)[i]);
221: }
222: return(0);
223: }
226: /* -------------------------------------------------------*/
227: /* Should check that shapes of vectors and matrices match */
228: /* -------------------------------------------------------*/
229: #include petscblaslapack.h
233: PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz)
234: {
235: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
236: PetscScalar *x,*z,sum;
237: MatScalar *v;
239: PetscInt mbs,i,*idx,*ii,n,*ridx=PETSC_NULL;
240: PetscTruth usecprow=a->compressedrow.use;
243: VecGetArray(xx,&x);
244: VecGetArray(zz,&z);
246: idx = a->j;
247: v = a->a;
248: if (usecprow){
249: mbs = a->compressedrow.nrows;
250: ii = a->compressedrow.i;
251: ridx = a->compressedrow.rindex;
252: } else {
253: mbs = a->mbs;
254: ii = a->i;
255: }
257: for (i=0; i<mbs; i++) {
258: n = ii[1] - ii[0]; ii++;
259: sum = 0.0;
260: while (n--) sum += *v++ * x[*idx++];
261: if (usecprow){
262: z[ridx[i]] = sum;
263: } else {
264: z[i] = sum;
265: }
266: }
267: VecRestoreArray(xx,&x);
268: VecRestoreArray(zz,&z);
269: PetscLogFlops(2*a->nz - mbs);
270: return(0);
271: }
275: PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz)
276: {
277: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
278: PetscScalar *x,*z = 0,*xb,sum1,sum2,*zarray;
279: PetscScalar x1,x2;
280: MatScalar *v;
282: PetscInt mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
283: PetscTruth usecprow=a->compressedrow.use;
286: VecGetArray(xx,&x);
287: VecGetArray(zz,&zarray);
289: idx = a->j;
290: v = a->a;
291: if (usecprow){
292: mbs = a->compressedrow.nrows;
293: ii = a->compressedrow.i;
294: ridx = a->compressedrow.rindex;
295: } else {
296: mbs = a->mbs;
297: ii = a->i;
298: z = zarray;
299: }
301: for (i=0; i<mbs; i++) {
302: n = ii[1] - ii[0]; ii++;
303: sum1 = 0.0; sum2 = 0.0;
304: for (j=0; j<n; j++) {
305: xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
306: sum1 += v[0]*x1 + v[2]*x2;
307: sum2 += v[1]*x1 + v[3]*x2;
308: v += 4;
309: }
310: if (usecprow) z = zarray + 2*ridx[i];
311: z[0] = sum1; z[1] = sum2;
312: if (!usecprow) z += 2;
313: }
314: VecRestoreArray(xx,&x);
315: VecRestoreArray(zz,&zarray);
316: PetscLogFlops(8*a->nz - 2*mbs);
317: return(0);
318: }
322: PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz)
323: {
324: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
325: PetscScalar *x,*z = 0,*xb,sum1,sum2,sum3,x1,x2,x3,*zarray;
326: MatScalar *v;
328: PetscInt mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
329: PetscTruth usecprow=a->compressedrow.use;
330:
332: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
333: #pragma disjoint(*v,*z,*xb)
334: #endif
337: VecGetArray(xx,&x);
338: VecGetArray(zz,&zarray);
340: idx = a->j;
341: v = a->a;
342: if (usecprow){
343: mbs = a->compressedrow.nrows;
344: ii = a->compressedrow.i;
345: ridx = a->compressedrow.rindex;
346: } else {
347: mbs = a->mbs;
348: ii = a->i;
349: z = zarray;
350: }
352: for (i=0; i<mbs; i++) {
353: n = ii[1] - ii[0]; ii++;
354: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0;
355: for (j=0; j<n; j++) {
356: xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
357: sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
358: sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
359: sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
360: v += 9;
361: }
362: if (usecprow) z = zarray + 3*ridx[i];
363: z[0] = sum1; z[1] = sum2; z[2] = sum3;
364: if (!usecprow) z += 3;
365: }
366: VecRestoreArray(xx,&x);
367: VecRestoreArray(zz,&zarray);
368: PetscLogFlops(18*a->nz - 3*mbs);
369: return(0);
370: }
374: PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz)
375: {
376: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
377: PetscScalar *x,*z = 0,*xb,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*zarray;
378: MatScalar *v;
380: PetscInt mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
381: PetscTruth usecprow=a->compressedrow.use;
384: VecGetArray(xx,&x);
385: VecGetArray(zz,&zarray);
387: idx = a->j;
388: v = a->a;
389: if (usecprow){
390: mbs = a->compressedrow.nrows;
391: ii = a->compressedrow.i;
392: ridx = a->compressedrow.rindex;
393: } else {
394: mbs = a->mbs;
395: ii = a->i;
396: z = zarray;
397: }
399: for (i=0; i<mbs; i++) {
400: n = ii[1] - ii[0]; ii++;
401: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0;
402: for (j=0; j<n; j++) {
403: xb = x + 4*(*idx++);
404: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
405: sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
406: sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
407: sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
408: sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
409: v += 16;
410: }
411: if (usecprow) z = zarray + 4*ridx[i];
412: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
413: if (!usecprow) z += 4;
414: }
415: VecRestoreArray(xx,&x);
416: VecRestoreArray(zz,&zarray);
417: PetscLogFlops(32*a->nz - 4*mbs);
418: return(0);
419: }
423: PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz)
424: {
425: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
426: PetscScalar sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*xb,*z = 0,*x,*zarray;
427: MatScalar *v;
429: PetscInt mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
430: PetscTruth usecprow=a->compressedrow.use;
433: VecGetArray(xx,&x);
434: VecGetArray(zz,&zarray);
436: idx = a->j;
437: v = a->a;
438: if (usecprow){
439: mbs = a->compressedrow.nrows;
440: ii = a->compressedrow.i;
441: ridx = a->compressedrow.rindex;
442: } else {
443: mbs = a->mbs;
444: ii = a->i;
445: z = zarray;
446: }
448: for (i=0; i<mbs; i++) {
449: n = ii[1] - ii[0]; ii++;
450: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
451: for (j=0; j<n; j++) {
452: xb = x + 5*(*idx++);
453: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
454: sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
455: sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
456: sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
457: sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
458: sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
459: v += 25;
460: }
461: if (usecprow) z = zarray + 5*ridx[i];
462: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
463: if (!usecprow) z += 5;
464: }
465: VecRestoreArray(xx,&x);
466: VecRestoreArray(zz,&zarray);
467: PetscLogFlops(50*a->nz - 5*mbs);
468: return(0);
469: }
474: PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz)
475: {
476: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
477: PetscScalar *x,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,sum6;
478: PetscScalar x1,x2,x3,x4,x5,x6,*zarray;
479: MatScalar *v;
481: PetscInt mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
482: PetscTruth usecprow=a->compressedrow.use;
485: VecGetArray(xx,&x);
486: VecGetArray(zz,&zarray);
488: idx = a->j;
489: v = a->a;
490: if (usecprow){
491: mbs = a->compressedrow.nrows;
492: ii = a->compressedrow.i;
493: ridx = a->compressedrow.rindex;
494: } else {
495: mbs = a->mbs;
496: ii = a->i;
497: z = zarray;
498: }
500: for (i=0; i<mbs; i++) {
501: n = ii[1] - ii[0]; ii++;
502: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0;
503: for (j=0; j<n; j++) {
504: xb = x + 6*(*idx++);
505: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
506: sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
507: sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
508: sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
509: sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
510: sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
511: sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
512: v += 36;
513: }
514: if (usecprow) z = zarray + 6*ridx[i];
515: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
516: if (!usecprow) z += 6;
517: }
519: VecRestoreArray(xx,&x);
520: VecRestoreArray(zz,&zarray);
521: PetscLogFlops(72*a->nz - 6*mbs);
522: return(0);
523: }
526: PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz)
527: {
528: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
529: PetscScalar *x,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
530: PetscScalar x1,x2,x3,x4,x5,x6,x7,*zarray;
531: MatScalar *v;
533: PetscInt mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
534: PetscTruth usecprow=a->compressedrow.use;
537: VecGetArray(xx,&x);
538: VecGetArray(zz,&zarray);
540: idx = a->j;
541: v = a->a;
542: if (usecprow){
543: mbs = a->compressedrow.nrows;
544: ii = a->compressedrow.i;
545: ridx = a->compressedrow.rindex;
546: } else {
547: mbs = a->mbs;
548: ii = a->i;
549: z = zarray;
550: }
552: for (i=0; i<mbs; i++) {
553: n = ii[1] - ii[0]; ii++;
554: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
555: for (j=0; j<n; j++) {
556: xb = x + 7*(*idx++);
557: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
558: sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
559: sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
560: sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
561: sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
562: sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
563: sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
564: sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
565: v += 49;
566: }
567: if (usecprow) z = zarray + 7*ridx[i];
568: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
569: if (!usecprow) z += 7;
570: }
572: VecRestoreArray(xx,&x);
573: VecRestoreArray(zz,&zarray);
574: PetscLogFlops(98*a->nz - 7*mbs);
575: return(0);
576: }
578: /*
579: This will not work with MatScalar == float because it calls the BLAS
580: */
583: PetscErrorCode MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz)
584: {
585: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
586: PetscScalar *x,*z = 0,*xb,*work,*workt,*zarray;
587: MatScalar *v;
589: PetscInt mbs=a->mbs,i,*idx,*ii,bs=A->rmap.bs,j,n,bs2=a->bs2;
590: PetscInt ncols,k,*ridx=PETSC_NULL;
591: PetscTruth usecprow=a->compressedrow.use;
594: VecGetArray(xx,&x);
595: VecGetArray(zz,&zarray);
597: idx = a->j;
598: v = a->a;
599: if (usecprow){
600: mbs = a->compressedrow.nrows;
601: ii = a->compressedrow.i;
602: ridx = a->compressedrow.rindex;
603: } else {
604: mbs = a->mbs;
605: ii = a->i;
606: z = zarray;
607: }
609: if (!a->mult_work) {
610: k = PetscMax(A->rmap.n,A->cmap.n);
611: PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);
612: }
613: work = a->mult_work;
614: for (i=0; i<mbs; i++) {
615: n = ii[1] - ii[0]; ii++;
616: ncols = n*bs;
617: workt = work;
618: for (j=0; j<n; j++) {
619: xb = x + bs*(*idx++);
620: for (k=0; k<bs; k++) workt[k] = xb[k];
621: workt += bs;
622: }
623: if (usecprow) z = zarray + bs*ridx[i];
624: Kernel_w_gets_Ar_times_v(bs,ncols,work,v,z);
625: /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One); */
626: v += n*bs2;
627: if (!usecprow) z += bs;
628: }
629: VecRestoreArray(xx,&x);
630: VecRestoreArray(zz,&zarray);
631: PetscLogFlops(2*a->nz*bs2 - bs*mbs);
632: return(0);
633: }
638: PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
639: {
640: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
641: PetscScalar *x,*y = 0,*z = 0,sum,*yarray,*zarray;
642: MatScalar *v;
644: PetscInt mbs=a->mbs,i,*idx,*ii,n,*ridx=PETSC_NULL;
645: PetscTruth usecprow=a->compressedrow.use;
648: VecGetArray(xx,&x);
649: VecGetArray(yy,&yarray);
650: if (zz != yy) {
651: VecGetArray(zz,&zarray);
652: } else {
653: zarray = yarray;
654: }
656: idx = a->j;
657: v = a->a;
658: if (usecprow){
659: if (zz != yy){
660: PetscMemcpy(zarray,yarray,mbs*sizeof(PetscScalar));
661: }
662: mbs = a->compressedrow.nrows;
663: ii = a->compressedrow.i;
664: ridx = a->compressedrow.rindex;
665: } else {
666: ii = a->i;
667: y = yarray;
668: z = zarray;
669: }
671: for (i=0; i<mbs; i++) {
672: n = ii[1] - ii[0]; ii++;
673: if (usecprow){
674: z = zarray + ridx[i];
675: y = yarray + ridx[i];
676: }
677: sum = y[0];
678: while (n--) sum += *v++ * x[*idx++];
679: z[0] = sum;
680: if (!usecprow){
681: z++; y++;
682: }
683: }
684: VecRestoreArray(xx,&x);
685: VecRestoreArray(yy,&yarray);
686: if (zz != yy) {
687: VecRestoreArray(zz,&zarray);
688: }
689: PetscLogFlops(2*a->nz);
690: return(0);
691: }
695: PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
696: {
697: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
698: PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2;
699: PetscScalar x1,x2,*yarray,*zarray;
700: MatScalar *v;
702: PetscInt mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
703: PetscTruth usecprow=a->compressedrow.use;
706: VecGetArray(xx,&x);
707: VecGetArray(yy,&yarray);
708: if (zz != yy) {
709: VecGetArray(zz,&zarray);
710: } else {
711: zarray = yarray;
712: }
714: idx = a->j;
715: v = a->a;
716: if (usecprow){
717: if (zz != yy){
718: PetscMemcpy(zarray,yarray,2*mbs*sizeof(PetscScalar));
719: }
720: mbs = a->compressedrow.nrows;
721: ii = a->compressedrow.i;
722: ridx = a->compressedrow.rindex;
723: if (zz != yy){
724: PetscMemcpy(zarray,yarray,a->mbs*sizeof(PetscScalar));
725: }
726: } else {
727: ii = a->i;
728: y = yarray;
729: z = zarray;
730: }
732: for (i=0; i<mbs; i++) {
733: n = ii[1] - ii[0]; ii++;
734: if (usecprow){
735: z = zarray + 2*ridx[i];
736: y = yarray + 2*ridx[i];
737: }
738: sum1 = y[0]; sum2 = y[1];
739: for (j=0; j<n; j++) {
740: xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
741: sum1 += v[0]*x1 + v[2]*x2;
742: sum2 += v[1]*x1 + v[3]*x2;
743: v += 4;
744: }
745: z[0] = sum1; z[1] = sum2;
746: if (!usecprow){
747: z += 2; y += 2;
748: }
749: }
750: VecRestoreArray(xx,&x);
751: VecRestoreArray(yy,&yarray);
752: if (zz != yy) {
753: VecRestoreArray(zz,&zarray);
754: }
755: PetscLogFlops(4*a->nz);
756: return(0);
757: }
761: PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
762: {
763: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
764: PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,x1,x2,x3,*yarray,*zarray;
765: MatScalar *v;
767: PetscInt mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
768: PetscTruth usecprow=a->compressedrow.use;
771: VecGetArray(xx,&x);
772: VecGetArray(yy,&yarray);
773: if (zz != yy) {
774: VecGetArray(zz,&zarray);
775: } else {
776: zarray = yarray;
777: }
779: idx = a->j;
780: v = a->a;
781: if (usecprow){
782: if (zz != yy){
783: PetscMemcpy(zarray,yarray,3*mbs*sizeof(PetscScalar));
784: }
785: mbs = a->compressedrow.nrows;
786: ii = a->compressedrow.i;
787: ridx = a->compressedrow.rindex;
788: } else {
789: ii = a->i;
790: y = yarray;
791: z = zarray;
792: }
794: for (i=0; i<mbs; i++) {
795: n = ii[1] - ii[0]; ii++;
796: if (usecprow){
797: z = zarray + 3*ridx[i];
798: y = yarray + 3*ridx[i];
799: }
800: sum1 = y[0]; sum2 = y[1]; sum3 = y[2];
801: for (j=0; j<n; j++) {
802: xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
803: sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
804: sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
805: sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
806: v += 9;
807: }
808: z[0] = sum1; z[1] = sum2; z[2] = sum3;
809: if (!usecprow){
810: z += 3; y += 3;
811: }
812: }
813: VecRestoreArray(xx,&x);
814: VecRestoreArray(yy,&yarray);
815: if (zz != yy) {
816: VecRestoreArray(zz,&zarray);
817: }
818: PetscLogFlops(18*a->nz);
819: return(0);
820: }
824: PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
825: {
826: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
827: PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*yarray,*zarray;
828: MatScalar *v;
830: PetscInt mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
831: PetscTruth usecprow=a->compressedrow.use;
834: VecGetArray(xx,&x);
835: VecGetArray(yy,&yarray);
836: if (zz != yy) {
837: VecGetArray(zz,&zarray);
838: } else {
839: zarray = yarray;
840: }
842: idx = a->j;
843: v = a->a;
844: if (usecprow){
845: if (zz != yy){
846: PetscMemcpy(zarray,yarray,4*mbs*sizeof(PetscScalar));
847: }
848: mbs = a->compressedrow.nrows;
849: ii = a->compressedrow.i;
850: ridx = a->compressedrow.rindex;
851: } else {
852: ii = a->i;
853: y = yarray;
854: z = zarray;
855: }
857: for (i=0; i<mbs; i++) {
858: n = ii[1] - ii[0]; ii++;
859: if (usecprow){
860: z = zarray + 4*ridx[i];
861: y = yarray + 4*ridx[i];
862: }
863: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];
864: for (j=0; j<n; j++) {
865: xb = x + 4*(*idx++);
866: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
867: sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
868: sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
869: sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
870: sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
871: v += 16;
872: }
873: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
874: if (!usecprow){
875: z += 4; y += 4;
876: }
877: }
878: VecRestoreArray(xx,&x);
879: VecRestoreArray(yy,&yarray);
880: if (zz != yy) {
881: VecRestoreArray(zz,&zarray);
882: }
883: PetscLogFlops(32*a->nz);
884: return(0);
885: }
889: PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
890: {
891: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
892: PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5;
893: PetscScalar *yarray,*zarray;
894: MatScalar *v;
896: PetscInt mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
897: PetscTruth usecprow=a->compressedrow.use;
900: VecGetArray(xx,&x);
901: VecGetArray(yy,&yarray);
902: if (zz != yy) {
903: VecGetArray(zz,&zarray);
904: } else {
905: zarray = yarray;
906: }
908: idx = a->j;
909: v = a->a;
910: if (usecprow){
911: if (zz != yy){
912: PetscMemcpy(zarray,yarray,5*mbs*sizeof(PetscScalar));
913: }
914: mbs = a->compressedrow.nrows;
915: ii = a->compressedrow.i;
916: ridx = a->compressedrow.rindex;
917: } else {
918: ii = a->i;
919: y = yarray;
920: z = zarray;
921: }
923: for (i=0; i<mbs; i++) {
924: n = ii[1] - ii[0]; ii++;
925: if (usecprow){
926: z = zarray + 5*ridx[i];
927: y = yarray + 5*ridx[i];
928: }
929: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4];
930: for (j=0; j<n; j++) {
931: xb = x + 5*(*idx++);
932: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
933: sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
934: sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
935: sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
936: sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
937: sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
938: v += 25;
939: }
940: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
941: if (!usecprow){
942: z += 5; y += 5;
943: }
944: }
945: VecRestoreArray(xx,&x);
946: VecRestoreArray(yy,&yarray);
947: if (zz != yy) {
948: VecRestoreArray(zz,&zarray);
949: }
950: PetscLogFlops(50*a->nz);
951: return(0);
952: }
955: PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
956: {
957: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
958: PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,sum6;
959: PetscScalar x1,x2,x3,x4,x5,x6,*yarray,*zarray;
960: MatScalar *v;
962: PetscInt mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
963: PetscTruth usecprow=a->compressedrow.use;
966: VecGetArray(xx,&x);
967: VecGetArray(yy,&yarray);
968: if (zz != yy) {
969: VecGetArray(zz,&zarray);
970: } else {
971: zarray = yarray;
972: }
974: idx = a->j;
975: v = a->a;
976: if (usecprow){
977: if (zz != yy){
978: PetscMemcpy(zarray,yarray,6*mbs*sizeof(PetscScalar));
979: }
980: mbs = a->compressedrow.nrows;
981: ii = a->compressedrow.i;
982: ridx = a->compressedrow.rindex;
983: } else {
984: ii = a->i;
985: y = yarray;
986: z = zarray;
987: }
989: for (i=0; i<mbs; i++) {
990: n = ii[1] - ii[0]; ii++;
991: if (usecprow){
992: z = zarray + 6*ridx[i];
993: y = yarray + 6*ridx[i];
994: }
995: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5];
996: for (j=0; j<n; j++) {
997: xb = x + 6*(*idx++);
998: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
999: sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1000: sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1001: sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1002: sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1003: sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1004: sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1005: v += 36;
1006: }
1007: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
1008: if (!usecprow){
1009: z += 6; y += 6;
1010: }
1011: }
1012: VecRestoreArray(xx,&x);
1013: VecRestoreArray(yy,&yarray);
1014: if (zz != yy) {
1015: VecRestoreArray(zz,&zarray);
1016: }
1017: PetscLogFlops(72*a->nz);
1018: return(0);
1019: }
1023: PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
1024: {
1025: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1026: PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
1027: PetscScalar x1,x2,x3,x4,x5,x6,x7,*yarray,*zarray;
1028: MatScalar *v;
1030: PetscInt mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
1031: PetscTruth usecprow=a->compressedrow.use;
1034: VecGetArray(xx,&x);
1035: VecGetArray(yy,&yarray);
1036: if (zz != yy) {
1037: VecGetArray(zz,&zarray);
1038: } else {
1039: zarray = yarray;
1040: }
1042: idx = a->j;
1043: v = a->a;
1044: if (usecprow){
1045: if (zz != yy){
1046: PetscMemcpy(zarray,yarray,7*mbs*sizeof(PetscScalar));
1047: }
1048: mbs = a->compressedrow.nrows;
1049: ii = a->compressedrow.i;
1050: ridx = a->compressedrow.rindex;
1051: } else {
1052: ii = a->i;
1053: y = yarray;
1054: z = zarray;
1055: }
1057: for (i=0; i<mbs; i++) {
1058: n = ii[1] - ii[0]; ii++;
1059: if (usecprow){
1060: z = zarray + 7*ridx[i];
1061: y = yarray + 7*ridx[i];
1062: }
1063: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
1064: for (j=0; j<n; j++) {
1065: xb = x + 7*(*idx++);
1066: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1067: sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1068: sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1069: sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1070: sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1071: sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1072: sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1073: sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1074: v += 49;
1075: }
1076: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1077: if (!usecprow){
1078: z += 7; y += 7;
1079: }
1080: }
1081: VecRestoreArray(xx,&x);
1082: VecRestoreArray(yy,&yarray);
1083: if (zz != yy) {
1084: VecRestoreArray(zz,&zarray);
1085: }
1086: PetscLogFlops(98*a->nz);
1087: return(0);
1088: }
1092: PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1093: {
1094: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1095: PetscScalar *x,*z = 0,*xb,*work,*workt,*zarray;
1096: MatScalar *v;
1098: PetscInt mbs,i,*idx,*ii,bs=A->rmap.bs,j,n,bs2=a->bs2;
1099: PetscInt ncols,k,*ridx=PETSC_NULL;
1100: PetscTruth usecprow=a->compressedrow.use;
1103: VecCopy_Seq(yy,zz);
1104: VecGetArray(xx,&x);
1105: VecGetArray(zz,&zarray);
1107: idx = a->j;
1108: v = a->a;
1109: if (usecprow){
1110: mbs = a->compressedrow.nrows;
1111: ii = a->compressedrow.i;
1112: ridx = a->compressedrow.rindex;
1113: } else {
1114: mbs = a->mbs;
1115: ii = a->i;
1116: z = zarray;
1117: }
1119: if (!a->mult_work) {
1120: k = PetscMax(A->rmap.n,A->cmap.n);
1121: PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);
1122: }
1123: work = a->mult_work;
1124: for (i=0; i<mbs; i++) {
1125: n = ii[1] - ii[0]; ii++;
1126: ncols = n*bs;
1127: workt = work;
1128: for (j=0; j<n; j++) {
1129: xb = x + bs*(*idx++);
1130: for (k=0; k<bs; k++) workt[k] = xb[k];
1131: workt += bs;
1132: }
1133: if (usecprow) z = zarray + bs*ridx[i];
1134: Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
1135: /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One); */
1136: v += n*bs2;
1137: if (!usecprow){
1138: z += bs;
1139: }
1140: }
1141: VecRestoreArray(xx,&x);
1142: VecRestoreArray(zz,&zarray);
1143: PetscLogFlops(2*a->nz*bs2);
1144: return(0);
1145: }
1149: PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
1150: {
1151: PetscScalar zero = 0.0;
1153: #if defined (PETSC_USE_LOG)
1154: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1155: #endif
1158: VecSet(zz,zero);
1159: MatMultTransposeAdd_SeqBAIJ(A,xx,zz,zz);
1161: PetscLogFlops(2*a->nz*a->bs2 - A->cmap.n);
1162: return(0);
1163: }
1167: PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1169: {
1170: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1171: PetscScalar *zb,*x,*z,*xb = 0,x1,x2,x3,x4,x5;
1172: MatScalar *v;
1174: PetscInt mbs,i,*idx,*ii,rval,bs=A->rmap.bs,j,n,bs2=a->bs2,*ib,*ridx=PETSC_NULL;
1175: Mat_CompressedRow cprow = a->compressedrow;
1176: PetscTruth usecprow=cprow.use;
1179: if (yy != zz) { VecCopy_Seq(yy,zz); }
1180: VecGetArray(xx,&x);
1181: VecGetArray(zz,&z);
1183: idx = a->j;
1184: v = a->a;
1185: if (usecprow){
1186: mbs = cprow.nrows;
1187: ii = cprow.i;
1188: ridx = cprow.rindex;
1189: } else {
1190: mbs=a->mbs;
1191: ii = a->i;
1192: xb = x;
1193: }
1195: switch (bs) {
1196: case 1:
1197: for (i=0; i<mbs; i++) {
1198: if (usecprow) xb = x + ridx[i];
1199: x1 = xb[0];
1200: ib = idx + ii[0];
1201: n = ii[1] - ii[0]; ii++;
1202: for (j=0; j<n; j++) {
1203: rval = ib[j];
1204: z[rval] += *v * x1;
1205: v++;
1206: }
1207: if (!usecprow) xb++;
1208: }
1209: break;
1210: case 2:
1211: for (i=0; i<mbs; i++) {
1212: if (usecprow) xb = x + 2*ridx[i];
1213: x1 = xb[0]; x2 = xb[1];
1214: ib = idx + ii[0];
1215: n = ii[1] - ii[0]; ii++;
1216: for (j=0; j<n; j++) {
1217: rval = ib[j]*2;
1218: z[rval++] += v[0]*x1 + v[1]*x2;
1219: z[rval++] += v[2]*x1 + v[3]*x2;
1220: v += 4;
1221: }
1222: if (!usecprow) xb += 2;
1223: }
1224: break;
1225: case 3:
1226: for (i=0; i<mbs; i++) {
1227: if (usecprow) xb = x + 3*ridx[i];
1228: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1229: ib = idx + ii[0];
1230: n = ii[1] - ii[0]; ii++;
1231: for (j=0; j<n; j++) {
1232: rval = ib[j]*3;
1233: z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
1234: z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
1235: z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
1236: v += 9;
1237: }
1238: if (!usecprow) xb += 3;
1239: }
1240: break;
1241: case 4:
1242: for (i=0; i<mbs; i++) {
1243: if (usecprow) xb = x + 4*ridx[i];
1244: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1245: ib = idx + ii[0];
1246: n = ii[1] - ii[0]; ii++;
1247: for (j=0; j<n; j++) {
1248: rval = ib[j]*4;
1249: z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
1250: z[rval++] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
1251: z[rval++] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
1252: z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1253: v += 16;
1254: }
1255: if (!usecprow) xb += 4;
1256: }
1257: break;
1258: case 5:
1259: for (i=0; i<mbs; i++) {
1260: if (usecprow) xb = x + 5*ridx[i];
1261: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1262: x4 = xb[3]; x5 = xb[4];
1263: ib = idx + ii[0];
1264: n = ii[1] - ii[0]; ii++;
1265: for (j=0; j<n; j++) {
1266: rval = ib[j]*5;
1267: z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
1268: z[rval++] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
1269: z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1270: z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1271: z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1272: v += 25;
1273: }
1274: if (!usecprow) xb += 5;
1275: }
1276: break;
1277: default: { /* block sizes larger then 5 by 5 are handled by BLAS */
1278: PetscInt ncols,k;
1279: PetscScalar *work,*workt,*xtmp;
1281: if (!a->mult_work) {
1282: k = PetscMax(A->rmap.n,A->cmap.n);
1283: PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);
1284: }
1285: work = a->mult_work;
1286: xtmp = x;
1287: for (i=0; i<mbs; i++) {
1288: n = ii[1] - ii[0]; ii++;
1289: ncols = n*bs;
1290: PetscMemzero(work,ncols*sizeof(PetscScalar));
1291: if (usecprow) {
1292: xtmp = x + bs*ridx[i];
1293: }
1294: Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
1295: /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */
1296: v += n*bs2;
1297: if (!usecprow) xtmp += bs;
1298: workt = work;
1299: for (j=0; j<n; j++) {
1300: zb = z + bs*(*idx++);
1301: for (k=0; k<bs; k++) zb[k] += workt[k] ;
1302: workt += bs;
1303: }
1304: }
1305: }
1306: }
1307: VecRestoreArray(xx,&x);
1308: VecRestoreArray(zz,&z);
1309: PetscLogFlops(2*a->nz*a->bs2);
1310: return(0);
1311: }
1315: PetscErrorCode MatScale_SeqBAIJ(Mat inA,PetscScalar alpha)
1316: {
1317: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)inA->data;
1318: PetscInt totalnz = a->bs2*a->nz;
1319: PetscScalar oalpha = alpha;
1321: #if defined(PETSC_USE_MAT_SINGLE)
1322: PetscInt i;
1323: #else
1324: PetscBLASInt tnz = (PetscBLASInt) totalnz,one = 1;
1325: #endif
1328: #if defined(PETSC_USE_MAT_SINGLE)
1329: for (i=0; i<totalnz; i++) a->a[i] *= oalpha;
1330: #else
1331: BLASscal_(&tnz,&oalpha,a->a,&one);
1332: #endif
1333: PetscLogFlops(totalnz);
1334: return(0);
1335: }
1339: PetscErrorCode MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm)
1340: {
1342: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1343: MatScalar *v = a->a;
1344: PetscReal sum = 0.0;
1345: PetscInt i,j,k,bs=A->rmap.bs,nz=a->nz,bs2=a->bs2,k1;
1348: if (type == NORM_FROBENIUS) {
1349: for (i=0; i< bs2*nz; i++) {
1350: #if defined(PETSC_USE_COMPLEX)
1351: sum += PetscRealPart(PetscConj(*v)*(*v)); v++;
1352: #else
1353: sum += (*v)*(*v); v++;
1354: #endif
1355: }
1356: *norm = sqrt(sum);
1357: } else if (type == NORM_1) { /* maximum column sum */
1358: PetscReal *tmp;
1359: PetscInt *bcol = a->j;
1360: PetscMalloc((A->cmap.n+1)*sizeof(PetscReal),&tmp);
1361: PetscMemzero(tmp,A->cmap.n*sizeof(PetscReal));
1362: for (i=0; i<nz; i++){
1363: for (j=0; j<bs; j++){
1364: k1 = bs*(*bcol) + j; /* column index */
1365: for (k=0; k<bs; k++){
1366: tmp[k1] += PetscAbsScalar(*v); v++;
1367: }
1368: }
1369: bcol++;
1370: }
1371: *norm = 0.0;
1372: for (j=0; j<A->cmap.n; j++) {
1373: if (tmp[j] > *norm) *norm = tmp[j];
1374: }
1375: PetscFree(tmp);
1376: } else if (type == NORM_INFINITY) { /* maximum row sum */
1377: *norm = 0.0;
1378: for (k=0; k<bs; k++) {
1379: for (j=0; j<a->mbs; j++) {
1380: v = a->a + bs2*a->i[j] + k;
1381: sum = 0.0;
1382: for (i=0; i<a->i[j+1]-a->i[j]; i++) {
1383: for (k1=0; k1<bs; k1++){
1384: sum += PetscAbsScalar(*v);
1385: v += bs;
1386: }
1387: }
1388: if (sum > *norm) *norm = sum;
1389: }
1390: }
1391: } else {
1392: SETERRQ(PETSC_ERR_SUP,"No support for this norm yet");
1393: }
1394: return(0);
1395: }
1400: PetscErrorCode MatEqual_SeqBAIJ(Mat A,Mat B,PetscTruth* flg)
1401: {
1402: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data,*b = (Mat_SeqBAIJ *)B->data;
1406: /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */
1407: if ((A->rmap.N != B->rmap.N) || (A->cmap.n != B->cmap.n) || (A->rmap.bs != B->rmap.bs)|| (a->nz != b->nz)) {
1408: *flg = PETSC_FALSE;
1409: return(0);
1410: }
1411:
1412: /* if the a->i are the same */
1413: PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);
1414: if (!*flg) {
1415: return(0);
1416: }
1417:
1418: /* if a->j are the same */
1419: PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);
1420: if (!*flg) {
1421: return(0);
1422: }
1423: /* if a->a are the same */
1424: PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap.bs)*(B->rmap.bs)*sizeof(PetscScalar),flg);
1425: return(0);
1426:
1427: }
1431: PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A,Vec v)
1432: {
1433: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1435: PetscInt i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
1436: PetscScalar *x,zero = 0.0;
1437: MatScalar *aa,*aa_j;
1440: if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1441: bs = A->rmap.bs;
1442: aa = a->a;
1443: ai = a->i;
1444: aj = a->j;
1445: ambs = a->mbs;
1446: bs2 = a->bs2;
1448: VecSet(v,zero);
1449: VecGetArray(v,&x);
1450: VecGetLocalSize(v,&n);
1451: if (n != A->rmap.N) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1452: for (i=0; i<ambs; i++) {
1453: for (j=ai[i]; j<ai[i+1]; j++) {
1454: if (aj[j] == i) {
1455: row = i*bs;
1456: aa_j = aa+j*bs2;
1457: for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1458: break;
1459: }
1460: }
1461: }
1462: VecRestoreArray(v,&x);
1463: return(0);
1464: }
1468: PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr)
1469: {
1470: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1471: PetscScalar *l,*r,x,*li,*ri;
1472: MatScalar *aa,*v;
1474: PetscInt i,j,k,lm,rn,M,m,n,*ai,*aj,mbs,tmp,bs,bs2;
1477: ai = a->i;
1478: aj = a->j;
1479: aa = a->a;
1480: m = A->rmap.n;
1481: n = A->cmap.n;
1482: bs = A->rmap.bs;
1483: mbs = a->mbs;
1484: bs2 = a->bs2;
1485: if (ll) {
1486: VecGetArray(ll,&l);
1487: VecGetLocalSize(ll,&lm);
1488: if (lm != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
1489: for (i=0; i<mbs; i++) { /* for each block row */
1490: M = ai[i+1] - ai[i];
1491: li = l + i*bs;
1492: v = aa + bs2*ai[i];
1493: for (j=0; j<M; j++) { /* for each block */
1494: for (k=0; k<bs2; k++) {
1495: (*v++) *= li[k%bs];
1496: }
1497: }
1498: }
1499: VecRestoreArray(ll,&l);
1500: PetscLogFlops(a->nz);
1501: }
1502:
1503: if (rr) {
1504: VecGetArray(rr,&r);
1505: VecGetLocalSize(rr,&rn);
1506: if (rn != n) SETERRQ(PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
1507: for (i=0; i<mbs; i++) { /* for each block row */
1508: M = ai[i+1] - ai[i];
1509: v = aa + bs2*ai[i];
1510: for (j=0; j<M; j++) { /* for each block */
1511: ri = r + bs*aj[ai[i]+j];
1512: for (k=0; k<bs; k++) {
1513: x = ri[k];
1514: for (tmp=0; tmp<bs; tmp++) (*v++) *= x;
1515: }
1516: }
1517: }
1518: VecRestoreArray(rr,&r);
1519: PetscLogFlops(a->nz);
1520: }
1521: return(0);
1522: }
1527: PetscErrorCode MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info)
1528: {
1529: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1532: info->rows_global = (double)A->rmap.N;
1533: info->columns_global = (double)A->cmap.n;
1534: info->rows_local = (double)A->rmap.n;
1535: info->columns_local = (double)A->cmap.n;
1536: info->block_size = a->bs2;
1537: info->nz_allocated = a->maxnz;
1538: info->nz_used = a->bs2*a->nz;
1539: info->nz_unneeded = (double)(info->nz_allocated - info->nz_used);
1540: info->assemblies = A->num_ass;
1541: info->mallocs = a->reallocs;
1542: info->memory = A->mem;
1543: if (A->factor) {
1544: info->fill_ratio_given = A->info.fill_ratio_given;
1545: info->fill_ratio_needed = A->info.fill_ratio_needed;
1546: info->factor_mallocs = A->info.factor_mallocs;
1547: } else {
1548: info->fill_ratio_given = 0;
1549: info->fill_ratio_needed = 0;
1550: info->factor_mallocs = 0;
1551: }
1552: return(0);
1553: }
1558: PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A)
1559: {
1560: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1564: PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));
1565: return(0);
1566: }