Actual source code: inode.c

  1: #define PETSCMAT_DLL

  3: /*
  4:   This file provides high performance routines for the Inode format (compressed sparse row)
  5:   by taking advantage of rows with identical nonzero structure (I-nodes).
  6: */
 7:  #include src/mat/impls/aij/seq/aij.h

 11: static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt* size,PetscInt ** ns)
 12: {
 13:   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
 15:   PetscInt       i,count,m,n,min_mn,*ns_row,*ns_col;

 18:   n      = A->cmap.n;
 19:   m      = A->rmap.n;
 20:   ns_row = a->inode.size;
 21: 
 22:   min_mn = (m < n) ? m : n;
 23:   if (!ns) {
 24:     for (count=0,i=0; count<min_mn; count+=ns_row[i],i++);
 25:     for(; count+1 < n; count++,i++);
 26:     if (count < n)  {
 27:       i++;
 28:     }
 29:     *size = i;
 30:     return(0);
 31:   }
 32:   PetscMalloc((n+1)*sizeof(PetscInt),&ns_col);
 33: 
 34:   /* Use the same row structure wherever feasible. */
 35:   for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) {
 36:     ns_col[i] = ns_row[i];
 37:   }

 39:   /* if m < n; pad up the remainder with inode_limit */
 40:   for(; count+1 < n; count++,i++) {
 41:     ns_col[i] = 1;
 42:   }
 43:   /* The last node is the odd ball. padd it up with the remaining rows; */
 44:   if (count < n)  {
 45:     ns_col[i] = n - count;
 46:     i++;
 47:   } else if (count > n) {
 48:     /* Adjust for the over estimation */
 49:     ns_col[i-1] += n - count;
 50:   }
 51:   *size = i;
 52:   *ns   = ns_col;
 53:   return(0);
 54: }


 57: /*
 58:       This builds symmetric version of nonzero structure,
 59: */
 62: static PetscErrorCode MatGetRowIJ_Inode_Symmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
 63: {
 64:   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
 66:   PetscInt       *work,*ia,*ja,*j,nz,nslim_row,nslim_col,m,row,col,*jmax,n;
 67:   PetscInt       *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2,*ai= a->i,*aj = a->j;

 70:   nslim_row = a->inode.node_count;
 71:   m         = A->rmap.n;
 72:   n         = A->cmap.n;
 73:   if (m != n) SETERRQ(PETSC_ERR_SUP,"MatGetRowIJ_Inode_Symmetric: Matrix should be square");
 74: 
 75:   /* Use the row_inode as column_inode */
 76:   nslim_col = nslim_row;
 77:   ns_col    = ns_row;

 79:   /* allocate space for reformated inode structure */
 80:   PetscMalloc((nslim_col+1)*sizeof(PetscInt),&tns);
 81:   PetscMalloc((n+1)*sizeof(PetscInt),&tvc);
 82:   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1];

 84:   for (i1=0,col=0; i1<nslim_col; ++i1){
 85:     nsz = ns_col[i1];
 86:     for (i2=0; i2<nsz; ++i2,++col)
 87:       tvc[col] = i1;
 88:   }
 89:   /* allocate space for row pointers */
 90:   PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);
 91:   *iia = ia;
 92:   PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));
 93:   PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);

 95:   /* determine the number of columns in each row */
 96:   ia[0] = oshift;
 97:   for (i1=0,row=0 ; i1<nslim_row; row+=ns_row[i1],i1++) {

 99:     j    = aj + ai[row] + ishift;
100:     jmax = aj + ai[row+1] + ishift;
101:     i2   = 0;
102:     col  = *j++ + ishift;
103:     i2   = tvc[col];
104:     while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */
105:       ia[i1+1]++;
106:       ia[i2+1]++;
107:       i2++;                     /* Start col of next node */
108:       while(((col=*j+ishift)<tns[i2]) && (j<jmax)) ++j;
109:       i2 = tvc[col];
110:     }
111:     if(i2 == i1) ia[i2+1]++;    /* now the diagonal element */
112:   }

114:   /* shift ia[i] to point to next row */
115:   for (i1=1; i1<nslim_row+1; i1++) {
116:     row        = ia[i1-1];
117:     ia[i1]    += row;
118:     work[i1-1] = row - oshift;
119:   }

121:   /* allocate space for column pointers */
122:   nz   = ia[nslim_row] + (!ishift);
123:   PetscMalloc(nz*sizeof(PetscInt),&ja);
124:   *jja = ja;

126:  /* loop over lower triangular part putting into ja */
127:   for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) {
128:     j    = aj + ai[row] + ishift;
129:     jmax = aj + ai[row+1] + ishift;
130:     i2   = 0;                     /* Col inode index */
131:     col  = *j++ + ishift;
132:     i2   = tvc[col];
133:     while (i2<i1 && j<jmax) {
134:       ja[work[i2]++] = i1 + oshift;
135:       ja[work[i1]++] = i2 + oshift;
136:       ++i2;
137:       while(((col=*j+ishift)< tns[i2])&&(j<jmax)) ++j; /* Skip rest col indices in this node */
138:       i2 = tvc[col];
139:     }
140:     if (i2 == i1) ja[work[i1]++] = i2 + oshift;

142:   }
143:   PetscFree(work);
144:   PetscFree(tns);
145:   PetscFree(tvc);
146:   return(0);
147: }

149: /*
150:       This builds nonsymmetric version of nonzero structure,
151: */
154: static PetscErrorCode MatGetRowIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
155: {
156:   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
158:   PetscInt       *work,*ia,*ja,*j,nz,nslim_row,n,row,col,*ns_col,nslim_col;
159:   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;

162:   nslim_row = a->inode.node_count;
163:   n         = A->cmap.n;

165:   /* Create The column_inode for this matrix */
166:   Mat_CreateColInode(A,&nslim_col,&ns_col);
167: 
168:   /* allocate space for reformated column_inode structure */
169:   PetscMalloc((nslim_col +1)*sizeof(PetscInt),&tns);
170:   PetscMalloc((n +1)*sizeof(PetscInt),&tvc);
171:   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];

173:   for (i1=0,col=0; i1<nslim_col; ++i1){
174:     nsz = ns_col[i1];
175:     for (i2=0; i2<nsz; ++i2,++col)
176:       tvc[col] = i1;
177:   }
178:   /* allocate space for row pointers */
179:   PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);
180:   *iia = ia;
181:   PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));
182:   PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);

184:   /* determine the number of columns in each row */
185:   ia[0] = oshift;
186:   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
187:     j   = aj + ai[row] + ishift;
188:     col = *j++ + ishift;
189:     i2  = tvc[col];
190:     nz  = ai[row+1] - ai[row];
191:     while (nz-- > 0) {           /* off-diagonal elemets */
192:       ia[i1+1]++;
193:       i2++;                     /* Start col of next node */
194:       while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
195:       if (nz > 0) i2 = tvc[col];
196:     }
197:   }

199:   /* shift ia[i] to point to next row */
200:   for (i1=1; i1<nslim_row+1; i1++) {
201:     row        = ia[i1-1];
202:     ia[i1]    += row;
203:     work[i1-1] = row - oshift;
204:   }

206:   /* allocate space for column pointers */
207:   nz   = ia[nslim_row] + (!ishift);
208:   PetscMalloc(nz*sizeof(PetscInt),&ja);
209:   *jja = ja;

211:  /* loop over matrix putting into ja */
212:   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
213:     j   = aj + ai[row] + ishift;
214:     i2  = 0;                     /* Col inode index */
215:     col = *j++ + ishift;
216:     i2  = tvc[col];
217:     nz  = ai[row+1] - ai[row];
218:     while (nz-- > 0) {
219:       ja[work[i1]++] = i2 + oshift;
220:       ++i2;
221:       while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
222:       if (nz > 0) i2 = tvc[col];
223:     }
224:   }
225:   PetscFree(ns_col);
226:   PetscFree(work);
227:   PetscFree(tns);
228:   PetscFree(tvc);
229:   return(0);
230: }

234: static PetscErrorCode MatGetRowIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
235: {
236:   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;

240:   *n     = a->inode.node_count;
241:   if (!ia) return(0);
242:   if (!blockcompressed) {
243:     MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);;
244:   } else if (symmetric) {
245:     MatGetRowIJ_Inode_Symmetric(A,ia,ja,0,oshift);
246:   } else {
247:     MatGetRowIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);
248:   }
249:   return(0);
250: }

254: static PetscErrorCode MatRestoreRowIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
255: {

259:   if (!ia) return(0);

261:   if (!blockcompressed) {
262:     MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);;
263:   } else {
264:     PetscFree(*ia);
265:     PetscFree(*ja);
266:   }

268:   return(0);
269: }

271: /* ----------------------------------------------------------- */

275: static PetscErrorCode MatGetColumnIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
276: {
277:   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
279:   PetscInt       *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col;
280:   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;

283:   nslim_row = a->inode.node_count;
284:   n         = A->cmap.n;

286:   /* Create The column_inode for this matrix */
287:   Mat_CreateColInode(A,&nslim_col,&ns_col);
288: 
289:   /* allocate space for reformated column_inode structure */
290:   PetscMalloc((nslim_col + 1)*sizeof(PetscInt),&tns);
291:   PetscMalloc((n + 1)*sizeof(PetscInt),&tvc);
292:   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];

294:   for (i1=0,col=0; i1<nslim_col; ++i1){
295:     nsz = ns_col[i1];
296:     for (i2=0; i2<nsz; ++i2,++col)
297:       tvc[col] = i1;
298:   }
299:   /* allocate space for column pointers */
300:   PetscMalloc((nslim_col+1)*sizeof(PetscInt),&ia);
301:   *iia = ia;
302:   PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));
303:   PetscMalloc((nslim_col+1)*sizeof(PetscInt),&work);

305:   /* determine the number of columns in each row */
306:   ia[0] = oshift;
307:   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
308:     j   = aj + ai[row] + ishift;
309:     col = *j++ + ishift;
310:     i2  = tvc[col];
311:     nz  = ai[row+1] - ai[row];
312:     while (nz-- > 0) {           /* off-diagonal elemets */
313:       /* ia[i1+1]++; */
314:       ia[i2+1]++;
315:       i2++;
316:       while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
317:       if (nz > 0) i2 = tvc[col];
318:     }
319:   }

321:   /* shift ia[i] to point to next col */
322:   for (i1=1; i1<nslim_col+1; i1++) {
323:     col        = ia[i1-1];
324:     ia[i1]    += col;
325:     work[i1-1] = col - oshift;
326:   }

328:   /* allocate space for column pointers */
329:   nz   = ia[nslim_col] + (!ishift);
330:   PetscMalloc(nz*sizeof(PetscInt),&ja);
331:   *jja = ja;

333:  /* loop over matrix putting into ja */
334:   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
335:     j   = aj + ai[row] + ishift;
336:     i2  = 0;                     /* Col inode index */
337:     col = *j++ + ishift;
338:     i2  = tvc[col];
339:     nz  = ai[row+1] - ai[row];
340:     while (nz-- > 0) {
341:       /* ja[work[i1]++] = i2 + oshift; */
342:       ja[work[i2]++] = i1 + oshift;
343:       i2++;
344:       while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
345:       if (nz > 0) i2 = tvc[col];
346:     }
347:   }
348:   PetscFree(ns_col);
349:   PetscFree(work);
350:   PetscFree(tns);
351:   PetscFree(tvc);
352:   return(0);
353: }

357: static PetscErrorCode MatGetColumnIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
358: {

362:   Mat_CreateColInode(A,n,PETSC_NULL);
363:   if (!ia) return(0);

365:   if (!blockcompressed) {
366:     MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);;
367:   } else if (symmetric) {
368:     /* Since the indices are symmetric it does'nt matter */
369:     MatGetRowIJ_Inode_Symmetric(A,ia,ja,0,oshift);
370:   } else {
371:     MatGetColumnIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);
372:   }
373:   return(0);
374: }

378: static PetscErrorCode MatRestoreColumnIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
379: {

383:   if (!ia) return(0);
384:   if (!blockcompressed) {
385:     MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);;
386:   } else {
387:     PetscFree(*ia);
388:     PetscFree(*ja);
389:   }
390:   return(0);
391: }

393: /* ----------------------------------------------------------- */

397: static PetscErrorCode MatMult_Inode(Mat A,Vec xx,Vec yy)
398: {
399:   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
400:   PetscScalar    sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
401:   PetscScalar    *v1,*v2,*v3,*v4,*v5,*x,*y;
403:   PetscInt       *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz;
404: 
405: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
406: #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5)
407: #endif

410:   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
411:   node_max = a->inode.node_count;
412:   ns       = a->inode.size;     /* Node Size array */
413:   VecGetArray(xx,&x);
414:   VecGetArray(yy,&y);
415:   idx  = a->j;
416:   v1   = a->a;
417:   ii   = a->i;

419:   for (i = 0,row = 0; i< node_max; ++i){
420:     nsz  = ns[i];
421:     n    = ii[1] - ii[0];
422:     ii  += nsz;
423:     sz   = n;                   /* No of non zeros in this row */
424:                                 /* Switch on the size of Node */
425:     switch (nsz){               /* Each loop in 'case' is unrolled */
426:     case 1 :
427:       sum1  = 0;
428: 
429:       for(n = 0; n< sz-1; n+=2) {
430:         i1   = idx[0];          /* The instructions are ordered to */
431:         i2   = idx[1];          /* make the compiler's job easy */
432:         idx += 2;
433:         tmp0 = x[i1];
434:         tmp1 = x[i2];
435:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
436:        }
437: 
438:       if (n == sz-1){          /* Take care of the last nonzero  */
439:         tmp0  = x[*idx++];
440:         sum1 += *v1++ * tmp0;
441:       }
442:       y[row++]=sum1;
443:       break;
444:     case 2:
445:       sum1  = 0;
446:       sum2  = 0;
447:       v2    = v1 + n;
448: 
449:       for (n = 0; n< sz-1; n+=2) {
450:         i1   = idx[0];
451:         i2   = idx[1];
452:         idx += 2;
453:         tmp0 = x[i1];
454:         tmp1 = x[i2];
455:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
456:         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
457:       }
458:       if (n == sz-1){
459:         tmp0  = x[*idx++];
460:         sum1 += *v1++ * tmp0;
461:         sum2 += *v2++ * tmp0;
462:       }
463:       y[row++]=sum1;
464:       y[row++]=sum2;
465:       v1      =v2;              /* Since the next block to be processed starts there*/
466:       idx    +=sz;
467:       break;
468:     case 3:
469:       sum1  = 0;
470:       sum2  = 0;
471:       sum3  = 0;
472:       v2    = v1 + n;
473:       v3    = v2 + n;
474: 
475:       for (n = 0; n< sz-1; n+=2) {
476:         i1   = idx[0];
477:         i2   = idx[1];
478:         idx += 2;
479:         tmp0 = x[i1];
480:         tmp1 = x[i2];
481:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
482:         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
483:         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
484:       }
485:       if (n == sz-1){
486:         tmp0  = x[*idx++];
487:         sum1 += *v1++ * tmp0;
488:         sum2 += *v2++ * tmp0;
489:         sum3 += *v3++ * tmp0;
490:       }
491:       y[row++]=sum1;
492:       y[row++]=sum2;
493:       y[row++]=sum3;
494:       v1       =v3;             /* Since the next block to be processed starts there*/
495:       idx     +=2*sz;
496:       break;
497:     case 4:
498:       sum1  = 0;
499:       sum2  = 0;
500:       sum3  = 0;
501:       sum4  = 0;
502:       v2    = v1 + n;
503:       v3    = v2 + n;
504:       v4    = v3 + n;
505: 
506:       for (n = 0; n< sz-1; n+=2) {
507:         i1   = idx[0];
508:         i2   = idx[1];
509:         idx += 2;
510:         tmp0 = x[i1];
511:         tmp1 = x[i2];
512:         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
513:         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
514:         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
515:         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
516:       }
517:       if (n == sz-1){
518:         tmp0  = x[*idx++];
519:         sum1 += *v1++ * tmp0;
520:         sum2 += *v2++ * tmp0;
521:         sum3 += *v3++ * tmp0;
522:         sum4 += *v4++ * tmp0;
523:       }
524:       y[row++]=sum1;
525:       y[row++]=sum2;
526:       y[row++]=sum3;
527:       y[row++]=sum4;
528:       v1      =v4;              /* Since the next block to be processed starts there*/
529:       idx    +=3*sz;
530:       break;
531:     case 5:
532:       sum1  = 0;
533:       sum2  = 0;
534:       sum3  = 0;
535:       sum4  = 0;
536:       sum5  = 0;
537:       v2    = v1 + n;
538:       v3    = v2 + n;
539:       v4    = v3 + n;
540:       v5    = v4 + n;
541: 
542:       for (n = 0; n<sz-1; n+=2) {
543:         i1   = idx[0];
544:         i2   = idx[1];
545:         idx += 2;
546:         tmp0 = x[i1];
547:         tmp1 = x[i2];
548:         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
549:         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
550:         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
551:         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
552:         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
553:       }
554:       if (n == sz-1){
555:         tmp0  = x[*idx++];
556:         sum1 += *v1++ * tmp0;
557:         sum2 += *v2++ * tmp0;
558:         sum3 += *v3++ * tmp0;
559:         sum4 += *v4++ * tmp0;
560:         sum5 += *v5++ * tmp0;
561:       }
562:       y[row++]=sum1;
563:       y[row++]=sum2;
564:       y[row++]=sum3;
565:       y[row++]=sum4;
566:       y[row++]=sum5;
567:       v1      =v5;       /* Since the next block to be processed starts there */
568:       idx    +=4*sz;
569:       break;
570:     default :
571:       SETERRQ(PETSC_ERR_COR,"Node size not yet supported");
572:     }
573:   }
574:   VecRestoreArray(xx,&x);
575:   VecRestoreArray(yy,&y);
576:   PetscLogFlops(2*a->nz - A->rmap.n);
577:   return(0);
578: }
579: /* ----------------------------------------------------------- */
580: /* Almost same code as the MatMult_Inode() */
583: static PetscErrorCode MatMultAdd_Inode(Mat A,Vec xx,Vec zz,Vec yy)
584: {
585:   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
586:   PetscScalar    sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
587:   PetscScalar    *v1,*v2,*v3,*v4,*v5,*x,*y,*z,*zt;
589:   PetscInt       *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz;
590: 
592:   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
593:   node_max = a->inode.node_count;
594:   ns       = a->inode.size;     /* Node Size array */
595:   VecGetArray(xx,&x);
596:   VecGetArray(yy,&y);
597:   if (zz != yy) {
598:     VecGetArray(zz,&z);
599:   } else {
600:     z = y;
601:   }
602:   zt = z;

604:   idx  = a->j;
605:   v1   = a->a;
606:   ii   = a->i;

608:   for (i = 0,row = 0; i< node_max; ++i){
609:     nsz  = ns[i];
610:     n    = ii[1] - ii[0];
611:     ii  += nsz;
612:     sz   = n;                   /* No of non zeros in this row */
613:                                 /* Switch on the size of Node */
614:     switch (nsz){               /* Each loop in 'case' is unrolled */
615:     case 1 :
616:       sum1  = *zt++;
617: 
618:       for(n = 0; n< sz-1; n+=2) {
619:         i1   = idx[0];          /* The instructions are ordered to */
620:         i2   = idx[1];          /* make the compiler's job easy */
621:         idx += 2;
622:         tmp0 = x[i1];
623:         tmp1 = x[i2];
624:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
625:        }
626: 
627:       if(n   == sz-1){          /* Take care of the last nonzero  */
628:         tmp0  = x[*idx++];
629:         sum1 += *v1++ * tmp0;
630:       }
631:       y[row++]=sum1;
632:       break;
633:     case 2:
634:       sum1  = *zt++;
635:       sum2  = *zt++;
636:       v2    = v1 + n;
637: 
638:       for(n = 0; n< sz-1; n+=2) {
639:         i1   = idx[0];
640:         i2   = idx[1];
641:         idx += 2;
642:         tmp0 = x[i1];
643:         tmp1 = x[i2];
644:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
645:         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
646:       }
647:       if(n   == sz-1){
648:         tmp0  = x[*idx++];
649:         sum1 += *v1++ * tmp0;
650:         sum2 += *v2++ * tmp0;
651:       }
652:       y[row++]=sum1;
653:       y[row++]=sum2;
654:       v1      =v2;              /* Since the next block to be processed starts there*/
655:       idx    +=sz;
656:       break;
657:     case 3:
658:       sum1  = *zt++;
659:       sum2  = *zt++;
660:       sum3  = *zt++;
661:       v2    = v1 + n;
662:       v3    = v2 + n;
663: 
664:       for (n = 0; n< sz-1; n+=2) {
665:         i1   = idx[0];
666:         i2   = idx[1];
667:         idx += 2;
668:         tmp0 = x[i1];
669:         tmp1 = x[i2];
670:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
671:         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
672:         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
673:       }
674:       if (n == sz-1){
675:         tmp0  = x[*idx++];
676:         sum1 += *v1++ * tmp0;
677:         sum2 += *v2++ * tmp0;
678:         sum3 += *v3++ * tmp0;
679:       }
680:       y[row++]=sum1;
681:       y[row++]=sum2;
682:       y[row++]=sum3;
683:       v1       =v3;             /* Since the next block to be processed starts there*/
684:       idx     +=2*sz;
685:       break;
686:     case 4:
687:       sum1  = *zt++;
688:       sum2  = *zt++;
689:       sum3  = *zt++;
690:       sum4  = *zt++;
691:       v2    = v1 + n;
692:       v3    = v2 + n;
693:       v4    = v3 + n;
694: 
695:       for (n = 0; n< sz-1; n+=2) {
696:         i1   = idx[0];
697:         i2   = idx[1];
698:         idx += 2;
699:         tmp0 = x[i1];
700:         tmp1 = x[i2];
701:         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
702:         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
703:         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
704:         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
705:       }
706:       if (n == sz-1){
707:         tmp0  = x[*idx++];
708:         sum1 += *v1++ * tmp0;
709:         sum2 += *v2++ * tmp0;
710:         sum3 += *v3++ * tmp0;
711:         sum4 += *v4++ * tmp0;
712:       }
713:       y[row++]=sum1;
714:       y[row++]=sum2;
715:       y[row++]=sum3;
716:       y[row++]=sum4;
717:       v1      =v4;              /* Since the next block to be processed starts there*/
718:       idx    +=3*sz;
719:       break;
720:     case 5:
721:       sum1  = *zt++;
722:       sum2  = *zt++;
723:       sum3  = *zt++;
724:       sum4  = *zt++;
725:       sum5  = *zt++;
726:       v2    = v1 + n;
727:       v3    = v2 + n;
728:       v4    = v3 + n;
729:       v5    = v4 + n;
730: 
731:       for (n = 0; n<sz-1; n+=2) {
732:         i1   = idx[0];
733:         i2   = idx[1];
734:         idx += 2;
735:         tmp0 = x[i1];
736:         tmp1 = x[i2];
737:         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
738:         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
739:         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
740:         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
741:         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
742:       }
743:       if(n   == sz-1){
744:         tmp0  = x[*idx++];
745:         sum1 += *v1++ * tmp0;
746:         sum2 += *v2++ * tmp0;
747:         sum3 += *v3++ * tmp0;
748:         sum4 += *v4++ * tmp0;
749:         sum5 += *v5++ * tmp0;
750:       }
751:       y[row++]=sum1;
752:       y[row++]=sum2;
753:       y[row++]=sum3;
754:       y[row++]=sum4;
755:       y[row++]=sum5;
756:       v1      =v5;       /* Since the next block to be processed starts there */
757:       idx    +=4*sz;
758:       break;
759:     default :
760:       SETERRQ(PETSC_ERR_COR,"Node size not yet supported");
761:     }
762:   }
763:   VecRestoreArray(xx,&x);
764:   VecRestoreArray(yy,&y);
765:   if (zz != yy) {
766:     VecRestoreArray(zz,&z);
767:   }
768:   PetscLogFlops(2*a->nz);
769:   return(0);
770: }

772: /* ----------------------------------------------------------- */
775: PetscErrorCode MatSolve_Inode(Mat A,Vec bb,Vec xx)
776: {
777:   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
778:   IS             iscol = a->col,isrow = a->row;
780:   PetscInt       *r,*c,i,j,n = A->rmap.n,*ai = a->i,nz,*a_j = a->j;
781:   PetscInt       node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1,*rout,*cout;
782:   PetscScalar    *x,*b,*a_a = a->a,*tmp,*tmps,*aa,tmp0,tmp1;
783:   PetscScalar    sum1,sum2,sum3,sum4,sum5,*v1,*v2,*v3,*v4,*v5;

786:   if (A->factor!=FACTOR_LU) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unfactored matrix");
787:   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
788:   node_max = a->inode.node_count;
789:   ns       = a->inode.size;     /* Node Size array */

791:   VecGetArray(bb,&b);
792:   VecGetArray(xx,&x);
793:   tmp  = a->solve_work;
794: 
795:   ISGetIndices(isrow,&rout); r = rout;
796:   ISGetIndices(iscol,&cout); c = cout + (n-1);
797: 
798:   /* forward solve the lower triangular */
799:   tmps = tmp ;
800:   aa   = a_a ;
801:   aj   = a_j ;
802:   ad   = a->diag;

804:   for (i = 0,row = 0; i< node_max; ++i){
805:     nsz = ns[i];
806:     aii = ai[row];
807:     v1  = aa + aii;
808:     vi  = aj + aii;
809:     nz  = ad[row]- aii;
810: 
811:     switch (nsz){               /* Each loop in 'case' is unrolled */
812:     case 1 :
813:       sum1 = b[*r++];
814:       /*      while (nz--) sum1 -= *v1++ *tmps[*vi++];*/
815:       for(j=0; j<nz-1; j+=2){
816:         i0   = vi[0];
817:         i1   = vi[1];
818:         vi  +=2;
819:         tmp0 = tmps[i0];
820:         tmp1 = tmps[i1];
821:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
822:       }
823:       if(j == nz-1){
824:         tmp0 = tmps[*vi++];
825:         sum1 -= *v1++ *tmp0;
826:       }
827:       tmp[row ++]=sum1;
828:       break;
829:     case 2:
830:       sum1 = b[*r++];
831:       sum2 = b[*r++];
832:       v2   = aa + ai[row+1];

834:       for(j=0; j<nz-1; j+=2){
835:         i0   = vi[0];
836:         i1   = vi[1];
837:         vi  +=2;
838:         tmp0 = tmps[i0];
839:         tmp1 = tmps[i1];
840:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
841:         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
842:       }
843:       if(j == nz-1){
844:         tmp0 = tmps[*vi++];
845:         sum1 -= *v1++ *tmp0;
846:         sum2 -= *v2++ *tmp0;
847:       }
848:       sum2 -= *v2++ * sum1;
849:       tmp[row ++]=sum1;
850:       tmp[row ++]=sum2;
851:       break;
852:     case 3:
853:       sum1 = b[*r++];
854:       sum2 = b[*r++];
855:       sum3 = b[*r++];
856:       v2   = aa + ai[row+1];
857:       v3   = aa + ai[row+2];
858: 
859:       for (j=0; j<nz-1; j+=2){
860:         i0   = vi[0];
861:         i1   = vi[1];
862:         vi  +=2;
863:         tmp0 = tmps[i0];
864:         tmp1 = tmps[i1];
865:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
866:         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
867:         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
868:       }
869:       if (j == nz-1){
870:         tmp0 = tmps[*vi++];
871:         sum1 -= *v1++ *tmp0;
872:         sum2 -= *v2++ *tmp0;
873:         sum3 -= *v3++ *tmp0;
874:       }
875:       sum2 -= *v2++ * sum1;
876:       sum3 -= *v3++ * sum1;
877:       sum3 -= *v3++ * sum2;
878:       tmp[row ++]=sum1;
879:       tmp[row ++]=sum2;
880:       tmp[row ++]=sum3;
881:       break;
882: 
883:     case 4:
884:       sum1 = b[*r++];
885:       sum2 = b[*r++];
886:       sum3 = b[*r++];
887:       sum4 = b[*r++];
888:       v2   = aa + ai[row+1];
889:       v3   = aa + ai[row+2];
890:       v4   = aa + ai[row+3];
891: 
892:       for (j=0; j<nz-1; j+=2){
893:         i0   = vi[0];
894:         i1   = vi[1];
895:         vi  +=2;
896:         tmp0 = tmps[i0];
897:         tmp1 = tmps[i1];
898:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
899:         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
900:         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
901:         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
902:       }
903:       if (j == nz-1){
904:         tmp0 = tmps[*vi++];
905:         sum1 -= *v1++ *tmp0;
906:         sum2 -= *v2++ *tmp0;
907:         sum3 -= *v3++ *tmp0;
908:         sum4 -= *v4++ *tmp0;
909:       }
910:       sum2 -= *v2++ * sum1;
911:       sum3 -= *v3++ * sum1;
912:       sum4 -= *v4++ * sum1;
913:       sum3 -= *v3++ * sum2;
914:       sum4 -= *v4++ * sum2;
915:       sum4 -= *v4++ * sum3;
916: 
917:       tmp[row ++]=sum1;
918:       tmp[row ++]=sum2;
919:       tmp[row ++]=sum3;
920:       tmp[row ++]=sum4;
921:       break;
922:     case 5:
923:       sum1 = b[*r++];
924:       sum2 = b[*r++];
925:       sum3 = b[*r++];
926:       sum4 = b[*r++];
927:       sum5 = b[*r++];
928:       v2   = aa + ai[row+1];
929:       v3   = aa + ai[row+2];
930:       v4   = aa + ai[row+3];
931:       v5   = aa + ai[row+4];
932: 
933:       for (j=0; j<nz-1; j+=2){
934:         i0   = vi[0];
935:         i1   = vi[1];
936:         vi  +=2;
937:         tmp0 = tmps[i0];
938:         tmp1 = tmps[i1];
939:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
940:         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
941:         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
942:         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
943:         sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
944:       }
945:       if (j == nz-1){
946:         tmp0 = tmps[*vi++];
947:         sum1 -= *v1++ *tmp0;
948:         sum2 -= *v2++ *tmp0;
949:         sum3 -= *v3++ *tmp0;
950:         sum4 -= *v4++ *tmp0;
951:         sum5 -= *v5++ *tmp0;
952:       }

954:       sum2 -= *v2++ * sum1;
955:       sum3 -= *v3++ * sum1;
956:       sum4 -= *v4++ * sum1;
957:       sum5 -= *v5++ * sum1;
958:       sum3 -= *v3++ * sum2;
959:       sum4 -= *v4++ * sum2;
960:       sum5 -= *v5++ * sum2;
961:       sum4 -= *v4++ * sum3;
962:       sum5 -= *v5++ * sum3;
963:       sum5 -= *v5++ * sum4;
964: 
965:       tmp[row ++]=sum1;
966:       tmp[row ++]=sum2;
967:       tmp[row ++]=sum3;
968:       tmp[row ++]=sum4;
969:       tmp[row ++]=sum5;
970:       break;
971:     default:
972:       SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
973:     }
974:   }
975:   /* backward solve the upper triangular */
976:   for (i=node_max -1 ,row = n-1 ; i>=0; i--){
977:     nsz = ns[i];
978:     aii = ai[row+1] -1;
979:     v1  = aa + aii;
980:     vi  = aj + aii;
981:     nz  = aii- ad[row];
982:     switch (nsz){               /* Each loop in 'case' is unrolled */
983:     case 1 :
984:       sum1 = tmp[row];

986:       for(j=nz ; j>1; j-=2){
987:         vi  -=2;
988:         i0   = vi[2];
989:         i1   = vi[1];
990:         tmp0 = tmps[i0];
991:         tmp1 = tmps[i1];
992:         v1   -= 2;
993:         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
994:       }
995:       if (j==1){
996:         tmp0  = tmps[*vi--];
997:         sum1 -= *v1-- * tmp0;
998:       }
999:       x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1000:       break;
1001:     case 2 :
1002:       sum1 = tmp[row];
1003:       sum2 = tmp[row -1];
1004:       v2   = aa + ai[row]-1;
1005:       for (j=nz ; j>1; j-=2){
1006:         vi  -=2;
1007:         i0   = vi[2];
1008:         i1   = vi[1];
1009:         tmp0 = tmps[i0];
1010:         tmp1 = tmps[i1];
1011:         v1   -= 2;
1012:         v2   -= 2;
1013:         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1014:         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1015:       }
1016:       if (j==1){
1017:         tmp0  = tmps[*vi--];
1018:         sum1 -= *v1-- * tmp0;
1019:         sum2 -= *v2-- * tmp0;
1020:       }
1021: 
1022:       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1023:       sum2   -= *v2-- * tmp0;
1024:       x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1025:       break;
1026:     case 3 :
1027:       sum1 = tmp[row];
1028:       sum2 = tmp[row -1];
1029:       sum3 = tmp[row -2];
1030:       v2   = aa + ai[row]-1;
1031:       v3   = aa + ai[row -1]-1;
1032:       for (j=nz ; j>1; j-=2){
1033:         vi  -=2;
1034:         i0   = vi[2];
1035:         i1   = vi[1];
1036:         tmp0 = tmps[i0];
1037:         tmp1 = tmps[i1];
1038:         v1   -= 2;
1039:         v2   -= 2;
1040:         v3   -= 2;
1041:         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1042:         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1043:         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1044:       }
1045:       if (j==1){
1046:         tmp0  = tmps[*vi--];
1047:         sum1 -= *v1-- * tmp0;
1048:         sum2 -= *v2-- * tmp0;
1049:         sum3 -= *v3-- * tmp0;
1050:       }
1051:       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1052:       sum2   -= *v2-- * tmp0;
1053:       sum3   -= *v3-- * tmp0;
1054:       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1055:       sum3   -= *v3-- * tmp0;
1056:       x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1057: 
1058:       break;
1059:     case 4 :
1060:       sum1 = tmp[row];
1061:       sum2 = tmp[row -1];
1062:       sum3 = tmp[row -2];
1063:       sum4 = tmp[row -3];
1064:       v2   = aa + ai[row]-1;
1065:       v3   = aa + ai[row -1]-1;
1066:       v4   = aa + ai[row -2]-1;

1068:       for (j=nz ; j>1; j-=2){
1069:         vi  -=2;
1070:         i0   = vi[2];
1071:         i1   = vi[1];
1072:         tmp0 = tmps[i0];
1073:         tmp1 = tmps[i1];
1074:         v1  -= 2;
1075:         v2  -= 2;
1076:         v3  -= 2;
1077:         v4  -= 2;
1078:         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1079:         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1080:         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1081:         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1082:       }
1083:       if (j==1){
1084:         tmp0  = tmps[*vi--];
1085:         sum1 -= *v1-- * tmp0;
1086:         sum2 -= *v2-- * tmp0;
1087:         sum3 -= *v3-- * tmp0;
1088:         sum4 -= *v4-- * tmp0;
1089:       }

1091:       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1092:       sum2   -= *v2-- * tmp0;
1093:       sum3   -= *v3-- * tmp0;
1094:       sum4   -= *v4-- * tmp0;
1095:       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1096:       sum3   -= *v3-- * tmp0;
1097:       sum4   -= *v4-- * tmp0;
1098:       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1099:       sum4   -= *v4-- * tmp0;
1100:       x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1101:       break;
1102:     case 5 :
1103:       sum1 = tmp[row];
1104:       sum2 = tmp[row -1];
1105:       sum3 = tmp[row -2];
1106:       sum4 = tmp[row -3];
1107:       sum5 = tmp[row -4];
1108:       v2   = aa + ai[row]-1;
1109:       v3   = aa + ai[row -1]-1;
1110:       v4   = aa + ai[row -2]-1;
1111:       v5   = aa + ai[row -3]-1;
1112:       for (j=nz ; j>1; j-=2){
1113:         vi  -= 2;
1114:         i0   = vi[2];
1115:         i1   = vi[1];
1116:         tmp0 = tmps[i0];
1117:         tmp1 = tmps[i1];
1118:         v1   -= 2;
1119:         v2   -= 2;
1120:         v3   -= 2;
1121:         v4   -= 2;
1122:         v5   -= 2;
1123:         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1124:         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1125:         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1126:         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1127:         sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
1128:       }
1129:       if (j==1){
1130:         tmp0  = tmps[*vi--];
1131:         sum1 -= *v1-- * tmp0;
1132:         sum2 -= *v2-- * tmp0;
1133:         sum3 -= *v3-- * tmp0;
1134:         sum4 -= *v4-- * tmp0;
1135:         sum5 -= *v5-- * tmp0;
1136:       }

1138:       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1139:       sum2   -= *v2-- * tmp0;
1140:       sum3   -= *v3-- * tmp0;
1141:       sum4   -= *v4-- * tmp0;
1142:       sum5   -= *v5-- * tmp0;
1143:       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1144:       sum3   -= *v3-- * tmp0;
1145:       sum4   -= *v4-- * tmp0;
1146:       sum5   -= *v5-- * tmp0;
1147:       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1148:       sum4   -= *v4-- * tmp0;
1149:       sum5   -= *v5-- * tmp0;
1150:       tmp0    = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1151:       sum5   -= *v5-- * tmp0;
1152:       x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--;
1153:       break;
1154:     default:
1155:       SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
1156:     }
1157:   }
1158:   ISRestoreIndices(isrow,&rout);
1159:   ISRestoreIndices(iscol,&cout);
1160:   VecRestoreArray(bb,&b);
1161:   VecRestoreArray(xx,&x);
1162:   PetscLogFlops(2*a->nz - A->cmap.n);
1163:   return(0);
1164: }

1168: PetscErrorCode MatLUFactorNumeric_Inode(Mat A,MatFactorInfo *info,Mat *B)
1169: {
1170:   Mat            C = *B;
1171:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data;
1172:   IS             iscol = b->col,isrow = b->row,isicol = b->icol;
1174:   PetscInt       *r,*ic,*c,n = A->rmap.n,*bi = b->i;
1175:   PetscInt       *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow;
1176:   PetscInt       *ics,i,j,idx,*ai = a->i,*aj = a->j,*bd = b->diag,node_max,nodesz;
1177:   PetscInt       *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj;
1178:   PetscScalar    *rtmp1,*rtmp2,*rtmp3,*v1,*v2,*v3,*pc1,*pc2,*pc3,mul1,mul2,mul3;
1179:   PetscScalar    tmp,*ba = b->a,*aa = a->a,*pv;
1180:   PetscReal      rs=0.0;
1181:   LUShift_Ctx    sctx;
1182:   PetscInt       newshift;

1185:   sctx.shift_top  = 0;
1186:   sctx.nshift_max = 0;
1187:   sctx.shift_lo   = 0;
1188:   sctx.shift_hi   = 0;

1190:   /* if both shift schemes are chosen by user, only use info->shiftpd */
1191:   if (info->shiftpd && info->shiftnz) info->shiftnz = 0.0;
1192:   if (info->shiftpd) { /* set sctx.shift_top=max{rs} */
1193:     sctx.shift_top = 0;
1194:     for (i=0; i<n; i++) {
1195:       /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1196:       rs    = 0.0;
1197:       ajtmp = aj + ai[i];
1198:       rtmp1 = aa + ai[i];
1199:       nz = ai[i+1] - ai[i];
1200:       for (j=0; j<nz; j++){
1201:         if (*ajtmp != i){
1202:           rs += PetscAbsScalar(*rtmp1++);
1203:         } else {
1204:           rs -= PetscRealPart(*rtmp1++);
1205:         }
1206:         ajtmp++;
1207:       }
1208:       if (rs>sctx.shift_top) sctx.shift_top = rs;
1209:     }
1210:     if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12;
1211:     sctx.shift_top *= 1.1;
1212:     sctx.nshift_max = 5;
1213:     sctx.shift_lo   = 0.;
1214:     sctx.shift_hi   = 1.;
1215:   }
1216:   sctx.shift_amount = 0;
1217:   sctx.nshift       = 0;

1219:   ISGetIndices(isrow,&r);
1220:   ISGetIndices(iscol,&c);
1221:   ISGetIndices(isicol,&ic);
1222:   PetscMalloc((3*n+1)*sizeof(PetscScalar),&rtmp1);
1223:   PetscMemzero(rtmp1,(3*n+1)*sizeof(PetscScalar));
1224:   ics   = ic ;
1225:   rtmp2 = rtmp1 + n;
1226:   rtmp3 = rtmp2 + n;
1227: 
1228:   node_max = a->inode.node_count;
1229:   ns       = a->inode.size ;
1230:   if (!ns){
1231:     SETERRQ(PETSC_ERR_PLIB,"Matrix without inode information");
1232:   }

1234:   /* If max inode size > 3, split it into two inodes.*/
1235:   /* also map the inode sizes according to the ordering */
1236:   PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);
1237:   for (i=0,j=0; i<node_max; ++i,++j){
1238:     if (ns[i]>3) {
1239:       tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5  */
1240:       ++j;
1241:       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1242:     } else {
1243:       tmp_vec1[j] = ns[i];
1244:     }
1245:   }
1246:   /* Use the correct node_max */
1247:   node_max = j;

1249:   /* Now reorder the inode info based on mat re-ordering info */
1250:   /* First create a row -> inode_size_array_index map */
1251:   PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);
1252:   PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);
1253:   for (i=0,row=0; i<node_max; i++) {
1254:     nodesz = tmp_vec1[i];
1255:     for (j=0; j<nodesz; j++,row++) {
1256:       nsmap[row] = i;
1257:     }
1258:   }
1259:   /* Using nsmap, create a reordered ns structure */
1260:   for (i=0,j=0; i< node_max; i++) {
1261:     nodesz       = tmp_vec1[nsmap[r[j]]];    /* here the reordered row_no is in r[] */
1262:     tmp_vec2[i]  = nodesz;
1263:     j           += nodesz;
1264:   }
1265:   PetscFree(nsmap);
1266:   PetscFree(tmp_vec1);
1267:   /* Now use the correct ns */
1268:   ns = tmp_vec2;

1270:   do {
1271:     sctx.lushift = PETSC_FALSE;
1272:     /* Now loop over each block-row, and do the factorization */
1273:     for (i=0,row=0; i<node_max; i++) {
1274:       nodesz = ns[i];
1275:       nz     = bi[row+1] - bi[row];
1276:       bjtmp  = bj + bi[row];

1278:       switch (nodesz){
1279:       case 1:
1280:         for  (j=0; j<nz; j++){
1281:           idx        = bjtmp[j];
1282:           rtmp1[idx] = 0.0;
1283:         }
1284: 
1285:         /* load in initial (unfactored row) */
1286:         idx    = r[row];
1287:         nz_tmp = ai[idx+1] - ai[idx];
1288:         ajtmp  = aj + ai[idx];
1289:         v1     = aa + ai[idx];

1291:         for (j=0; j<nz_tmp; j++) {
1292:           idx        = ics[ajtmp[j]];
1293:           rtmp1[idx] = v1[j];
1294:         }
1295:         rtmp1[ics[r[row]]] += sctx.shift_amount;

1297:         prow = *bjtmp++ ;
1298:         while (prow < row) {
1299:           pc1 = rtmp1 + prow;
1300:           if (*pc1 != 0.0){
1301:             pv   = ba + bd[prow];
1302:             pj   = nbj + bd[prow];
1303:             mul1 = *pc1 * *pv++;
1304:             *pc1 = mul1;
1305:             nz_tmp = bi[prow+1] - bd[prow] - 1;
1306:             PetscLogFlops(2*nz_tmp);
1307:             for (j=0; j<nz_tmp; j++) {
1308:               tmp = pv[j];
1309:               idx = pj[j];
1310:               rtmp1[idx] -= mul1 * tmp;
1311:             }
1312:           }
1313:           prow = *bjtmp++ ;
1314:         }
1315:         pj  = bj + bi[row];
1316:         pc1 = ba + bi[row];

1318:         sctx.pv    = rtmp1[row];
1319:         rtmp1[row] = 1.0/rtmp1[row]; /* invert diag */
1320:         rs         = 0.0;
1321:         for (j=0; j<nz; j++) {
1322:           idx    = pj[j];
1323:           pc1[j] = rtmp1[idx]; /* rtmp1 -> ba */
1324:           if (idx != row) rs += PetscAbsScalar(pc1[j]);
1325:         }
1326:         sctx.rs  = rs;
1327:         MatLUCheckShift_inline(info,sctx,row,newshift);
1328:         if (newshift == 1) goto endofwhile;
1329:         break;
1330: 
1331:       case 2:
1332:         for (j=0; j<nz; j++) {
1333:           idx        = bjtmp[j];
1334:           rtmp1[idx] = 0.0;
1335:           rtmp2[idx] = 0.0;
1336:         }
1337: 
1338:         /* load in initial (unfactored row) */
1339:         idx    = r[row];
1340:         nz_tmp = ai[idx+1] - ai[idx];
1341:         ajtmp  = aj + ai[idx];
1342:         v1     = aa + ai[idx];
1343:         v2     = aa + ai[idx+1];
1344:         for (j=0; j<nz_tmp; j++) {
1345:           idx        = ics[ajtmp[j]];
1346:           rtmp1[idx] = v1[j];
1347:           rtmp2[idx] = v2[j];
1348:         }
1349:         rtmp1[ics[r[row]]]   += sctx.shift_amount;
1350:         rtmp2[ics[r[row+1]]] += sctx.shift_amount;

1352:         prow = *bjtmp++ ;
1353:         while (prow < row) {
1354:           pc1 = rtmp1 + prow;
1355:           pc2 = rtmp2 + prow;
1356:           if (*pc1 != 0.0 || *pc2 != 0.0){
1357:             pv   = ba + bd[prow];
1358:             pj   = nbj + bd[prow];
1359:             mul1 = *pc1 * *pv;
1360:             mul2 = *pc2 * *pv;
1361:             ++pv;
1362:             *pc1 = mul1;
1363:             *pc2 = mul2;
1364: 
1365:             nz_tmp = bi[prow+1] - bd[prow] - 1;
1366:             for (j=0; j<nz_tmp; j++) {
1367:               tmp = pv[j];
1368:               idx = pj[j];
1369:               rtmp1[idx] -= mul1 * tmp;
1370:               rtmp2[idx] -= mul2 * tmp;
1371:             }
1372:             PetscLogFlops(4*nz_tmp);
1373:           }
1374:           prow = *bjtmp++ ;
1375:         }

1377:         /* Now take care of diagonal 2x2 block. Note: prow = row here */
1378:         pc1 = rtmp1 + prow;
1379:         pc2 = rtmp2 + prow;

1381:         sctx.pv = *pc1;
1382:         pj      = bj + bi[prow];
1383:         rs      = 0.0;
1384:         for (j=0; j<nz; j++){
1385:           idx = pj[j];
1386:           if (idx != prow) rs += PetscAbsScalar(rtmp1[idx]);
1387:         }
1388:         sctx.rs = rs;
1389:         MatLUCheckShift_inline(info,sctx,row,newshift);
1390:         if (newshift == 1) goto endofwhile;

1392:         if (*pc2 != 0.0){
1393:           pj     = nbj + bd[prow];
1394:           mul2   = (*pc2)/(*pc1); /* since diag is not yet inverted.*/
1395:           *pc2   = mul2;
1396:           nz_tmp = bi[prow+1] - bd[prow] - 1;
1397:           for (j=0; j<nz_tmp; j++) {
1398:             idx = pj[j] ;
1399:             tmp = rtmp1[idx];
1400:             rtmp2[idx] -= mul2 * tmp;
1401:           }
1402:           PetscLogFlops(2*nz_tmp);
1403:         }
1404: 
1405:         pj  = bj + bi[row];
1406:         pc1 = ba + bi[row];
1407:         pc2 = ba + bi[row+1];

1409:         sctx.pv = rtmp2[row+1];
1410:         rs = 0.0;
1411:         rtmp1[row]   = 1.0/rtmp1[row];
1412:         rtmp2[row+1] = 1.0/rtmp2[row+1];
1413:         /* copy row entries from dense representation to sparse */
1414:         for (j=0; j<nz; j++) {
1415:           idx    = pj[j];
1416:           pc1[j] = rtmp1[idx];
1417:           pc2[j] = rtmp2[idx];
1418:           if (idx != row+1) rs += PetscAbsScalar(pc2[j]);
1419:         }
1420:         sctx.rs = rs;
1421:         MatLUCheckShift_inline(info,sctx,row+1,newshift);
1422:         if (newshift == 1) goto endofwhile;
1423:         break;

1425:       case 3:
1426:         for  (j=0; j<nz; j++) {
1427:           idx        = bjtmp[j];
1428:           rtmp1[idx] = 0.0;
1429:           rtmp2[idx] = 0.0;
1430:           rtmp3[idx] = 0.0;
1431:         }
1432:         /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
1433:         idx    = r[row];
1434:         nz_tmp = ai[idx+1] - ai[idx];
1435:         ajtmp = aj + ai[idx];
1436:         v1    = aa + ai[idx];
1437:         v2    = aa + ai[idx+1];
1438:         v3    = aa + ai[idx+2];
1439:         for (j=0; j<nz_tmp; j++) {
1440:           idx        = ics[ajtmp[j]];
1441:           rtmp1[idx] = v1[j];
1442:           rtmp2[idx] = v2[j];
1443:           rtmp3[idx] = v3[j];
1444:         }
1445:         rtmp1[ics[r[row]]]   += sctx.shift_amount;
1446:         rtmp2[ics[r[row+1]]] += sctx.shift_amount;
1447:         rtmp3[ics[r[row+2]]] += sctx.shift_amount;

1449:         /* loop over all pivot row blocks above this row block */
1450:         prow = *bjtmp++ ;
1451:         while (prow < row) {
1452:           pc1 = rtmp1 + prow;
1453:           pc2 = rtmp2 + prow;
1454:           pc3 = rtmp3 + prow;
1455:           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0){
1456:             pv   = ba  + bd[prow];
1457:             pj   = nbj + bd[prow];
1458:             mul1 = *pc1 * *pv;
1459:             mul2 = *pc2 * *pv;
1460:             mul3 = *pc3 * *pv;
1461:             ++pv;
1462:             *pc1 = mul1;
1463:             *pc2 = mul2;
1464:             *pc3 = mul3;
1465: 
1466:             nz_tmp = bi[prow+1] - bd[prow] - 1;
1467:             /* update this row based on pivot row */
1468:             for (j=0; j<nz_tmp; j++) {
1469:               tmp = pv[j];
1470:               idx = pj[j];
1471:               rtmp1[idx] -= mul1 * tmp;
1472:               rtmp2[idx] -= mul2 * tmp;
1473:               rtmp3[idx] -= mul3 * tmp;
1474:             }
1475:             PetscLogFlops(6*nz_tmp);
1476:           }
1477:           prow = *bjtmp++ ;
1478:         }

1480:         /* Now take care of diagonal 3x3 block in this set of rows */
1481:         /* note: prow = row here */
1482:         pc1 = rtmp1 + prow;
1483:         pc2 = rtmp2 + prow;
1484:         pc3 = rtmp3 + prow;

1486:         sctx.pv = *pc1;
1487:         pj      = bj + bi[prow];
1488:         rs      = 0.0;
1489:         for (j=0; j<nz; j++){
1490:           idx = pj[j];
1491:           if (idx != row) rs += PetscAbsScalar(rtmp1[idx]);
1492:         }
1493:         sctx.rs = rs;
1494:         MatLUCheckShift_inline(info,sctx,row,newshift);
1495:         if (newshift == 1) goto endofwhile;

1497:         if (*pc2 != 0.0 || *pc3 != 0.0){
1498:           mul2 = (*pc2)/(*pc1);
1499:           mul3 = (*pc3)/(*pc1);
1500:           *pc2 = mul2;
1501:           *pc3 = mul3;
1502:           nz_tmp = bi[prow+1] - bd[prow] - 1;
1503:           pj     = nbj + bd[prow];
1504:           for (j=0; j<nz_tmp; j++) {
1505:             idx = pj[j] ;
1506:             tmp = rtmp1[idx];
1507:             rtmp2[idx] -= mul2 * tmp;
1508:             rtmp3[idx] -= mul3 * tmp;
1509:           }
1510:           PetscLogFlops(4*nz_tmp);
1511:         }
1512:         ++prow;

1514:         pc2 = rtmp2 + prow;
1515:         pc3 = rtmp3 + prow;
1516:         sctx.pv = *pc2;
1517:         pj      = bj + bi[prow];
1518:         rs      = 0.0;
1519:         for (j=0; j<nz; j++){
1520:           idx = pj[j];
1521:           if (idx != prow) rs += PetscAbsScalar(rtmp2[idx]);
1522:         }
1523:         sctx.rs = rs;
1524:         MatLUCheckShift_inline(info,sctx,row+1,newshift);
1525:         if (newshift == 1) goto endofwhile;

1527:         if (*pc3 != 0.0){
1528:           mul3   = (*pc3)/(*pc2);
1529:           *pc3   = mul3;
1530:           pj     = nbj + bd[prow];
1531:           nz_tmp = bi[prow+1] - bd[prow] - 1;
1532:           for (j=0; j<nz_tmp; j++) {
1533:             idx = pj[j] ;
1534:             tmp = rtmp2[idx];
1535:             rtmp3[idx] -= mul3 * tmp;
1536:           }
1537:           PetscLogFlops(4*nz_tmp);
1538:         }

1540:         pj  = bj + bi[row];
1541:         pc1 = ba + bi[row];
1542:         pc2 = ba + bi[row+1];
1543:         pc3 = ba + bi[row+2];

1545:         sctx.pv = rtmp3[row+2];
1546:         rs = 0.0;
1547:         rtmp1[row]   = 1.0/rtmp1[row];
1548:         rtmp2[row+1] = 1.0/rtmp2[row+1];
1549:         rtmp3[row+2] = 1.0/rtmp3[row+2];
1550:         /* copy row entries from dense representation to sparse */
1551:         for (j=0; j<nz; j++) {
1552:           idx    = pj[j];
1553:           pc1[j] = rtmp1[idx];
1554:           pc2[j] = rtmp2[idx];
1555:           pc3[j] = rtmp3[idx];
1556:           if (idx != row+2) rs += PetscAbsScalar(pc3[j]);
1557:         }

1559:         sctx.rs = rs;
1560:         MatLUCheckShift_inline(info,sctx,row+2,newshift);
1561:         if (newshift == 1) goto endofwhile;
1562:         break;

1564:       default:
1565:         SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
1566:       }
1567:       row += nodesz;                 /* Update the row */
1568:     }
1569:     endofwhile:;
1570:   } while (sctx.lushift);
1571:   PetscFree(rtmp1);
1572:   PetscFree(tmp_vec2);
1573:   ISRestoreIndices(isicol,&ic);
1574:   ISRestoreIndices(isrow,&r);
1575:   ISRestoreIndices(iscol,&c);
1576:   C->factor      = FACTOR_LU;
1577:   C->assembled   = PETSC_TRUE;
1578:   if (sctx.nshift) {
1579:     if (info->shiftnz) {
1580:       PetscInfo2(0,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);
1581:     } else if (info->shiftpd) {
1582:       PetscInfo4(0,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,info->shift_fraction,sctx.shift_top);
1583:     }
1584:   }
1585:   PetscLogFlops(C->cmap.n);
1586:   return(0);
1587: }

1589: /*
1590:      Makes a longer coloring[] array and calls the usual code with that
1591: */
1594: PetscErrorCode MatColoringPatch_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring)
1595: {
1596:   Mat_SeqAIJ       *a = (Mat_SeqAIJ*)mat->data;
1597:   PetscErrorCode  ierr;
1598:   PetscInt        n = mat->cmap.n,m = a->inode.node_count,j,*ns = a->inode.size,row;
1599:   PetscInt        *colorused,i;
1600:   ISColoringValue *newcolor;

1603:   PetscMalloc((n+1)*sizeof(PetscInt),&newcolor);
1604:   /* loop over inodes, marking a color for each column*/
1605:   row = 0;
1606:   for (i=0; i<m; i++){
1607:     for (j=0; j<ns[i]; j++) {
1608:       newcolor[row++] = coloring[i] + j*ncolors;
1609:     }
1610:   }

1612:   /* eliminate unneeded colors */
1613:   PetscMalloc(5*ncolors*sizeof(PetscInt),&colorused);
1614:   PetscMemzero(colorused,5*ncolors*sizeof(PetscInt));
1615:   for (i=0; i<n; i++) {
1616:     colorused[newcolor[i]] = 1;
1617:   }

1619:   for (i=1; i<5*ncolors; i++) {
1620:     colorused[i] += colorused[i-1];
1621:   }
1622:   ncolors = colorused[5*ncolors-1];
1623:   for (i=0; i<n; i++) {
1624:     newcolor[i] = colorused[newcolor[i]]-1;
1625:   }
1626:   PetscFree(colorused);
1627:   ISColoringCreate(mat->comm,ncolors,n,newcolor,iscoloring);
1628:   PetscFree(coloring);
1629:   return(0);
1630: }

1632: /*
1633:     samestructure indicates that the matrix has not changed its nonzero structure so we 
1634:     do not need to recompute the inodes 
1635: */
1638: PetscErrorCode Mat_CheckInode(Mat A,PetscTruth samestructure)
1639: {
1640:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
1642:   PetscInt       i,j,m,nzx,nzy,*idx,*idy,*ns,*ii,node_count,blk_size;
1643:   PetscTruth     flag;

1646:   if (!a->inode.use)                     return(0);
1647:   if (a->inode.checked && samestructure) return(0);


1650:   m = A->rmap.n;
1651:   if (a->inode.size) {ns = a->inode.size;}
1652:   else {PetscMalloc((m+1)*sizeof(PetscInt),&ns);}

1654:   i          = 0;
1655:   node_count = 0;
1656:   idx        = a->j;
1657:   ii         = a->i;
1658:   while (i < m){                /* For each row */
1659:     nzx = ii[i+1] - ii[i];       /* Number of nonzeros */
1660:     /* Limits the number of elements in a node to 'a->inode.limit' */
1661:     for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
1662:       nzy     = ii[j+1] - ii[j]; /* Same number of nonzeros */
1663:       if (nzy != nzx) break;
1664:       idy  += nzx;             /* Same nonzero pattern */
1665:       PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);
1666:       if (!flag) break;
1667:     }
1668:     ns[node_count++] = blk_size;
1669:     idx += blk_size*nzx;
1670:     i    = j;
1671:   }
1672:   /* If not enough inodes found,, do not use inode version of the routines */
1673:   if (!a->inode.size && m && node_count > 0.9*m) {
1674:     PetscFree(ns);
1675:     a->inode.node_count     = 0;
1676:     a->inode.size           = PETSC_NULL;
1677:     a->inode.use            = PETSC_FALSE;
1678:     PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);
1679:   } else {
1680:     A->ops->mult            = MatMult_Inode;
1681:     A->ops->multadd         = MatMultAdd_Inode;
1682:     A->ops->solve           = MatSolve_Inode;
1683:     A->ops->lufactornumeric = MatLUFactorNumeric_Inode;
1684:     A->ops->getrowij        = MatGetRowIJ_Inode;
1685:     A->ops->restorerowij    = MatRestoreRowIJ_Inode;
1686:     A->ops->getcolumnij     = MatGetColumnIJ_Inode;
1687:     A->ops->restorecolumnij = MatRestoreColumnIJ_Inode;
1688:     A->ops->coloringpatch   = MatColoringPatch_Inode;
1689:     a->inode.node_count     = node_count;
1690:     a->inode.size           = ns;
1691:     PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);
1692:   }
1693:   return(0);
1694: }

1696: /*
1697:      This is really ugly. if inodes are used this replaces the 
1698:   permutations with ones that correspond to rows/cols of the matrix
1699:   rather then inode blocks
1700: */
1703: PetscErrorCode  MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm)
1704: {
1705:   PetscErrorCode ierr,(*f)(Mat,IS*,IS*);

1708:   PetscObjectQueryFunction((PetscObject)A,"MatInodeAdjustForInodes_C",(void (**)(void))&f);
1709:   if (f) {
1710:     (*f)(A,rperm,cperm);
1711:   }
1712:   return(0);
1713: }

1718: PetscErrorCode  MatInodeAdjustForInodes_Inode(Mat A,IS *rperm,IS *cperm)
1719: {
1720:   Mat_SeqAIJ      *a=(Mat_SeqAIJ*)A->data;
1722:   PetscInt       m = A->rmap.n,n = A->cmap.n,i,j,*ridx,*cidx,nslim_row = a->inode.node_count;
1723:   PetscInt       row,col,*permr,*permc,*ns_row =  a->inode.size,*tns,start_val,end_val,indx;
1724:   PetscInt       nslim_col,*ns_col;
1725:   IS             ris = *rperm,cis = *cperm;

1728:   if (!a->inode.size) return(0); /* no inodes so return */
1729:   if (a->inode.node_count == m) return(0); /* all inodes are of size 1 */

1731:   Mat_CreateColInode(A,&nslim_col,&ns_col);
1732:   PetscMalloc((((nslim_row>nslim_col)?nslim_row:nslim_col)+1)*sizeof(PetscInt),&tns);
1733:   PetscMalloc((m+n+1)*sizeof(PetscInt),&permr);
1734:   permc = permr + m;

1736:   ISGetIndices(ris,&ridx);
1737:   ISGetIndices(cis,&cidx);

1739:   /* Form the inode structure for the rows of permuted matric using inv perm*/
1740:   for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i];

1742:   /* Construct the permutations for rows*/
1743:   for (i=0,row = 0; i<nslim_row; ++i){
1744:     indx      = ridx[i];
1745:     start_val = tns[indx];
1746:     end_val   = tns[indx + 1];
1747:     for (j=start_val; j<end_val; ++j,++row) permr[row]= j;
1748:   }

1750:   /* Form the inode structure for the columns of permuted matrix using inv perm*/
1751:   for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i];

1753:  /* Construct permutations for columns */
1754:   for (i=0,col=0; i<nslim_col; ++i){
1755:     indx      = cidx[i];
1756:     start_val = tns[indx];
1757:     end_val   = tns[indx + 1];
1758:     for (j = start_val; j<end_val; ++j,++col) permc[col]= j;
1759:   }

1761:   ISCreateGeneral(PETSC_COMM_SELF,n,permr,rperm);
1762:   ISSetPermutation(*rperm);
1763:   ISCreateGeneral(PETSC_COMM_SELF,n,permc,cperm);
1764:   ISSetPermutation(*cperm);
1765: 
1766:   ISRestoreIndices(ris,&ridx);
1767:   ISRestoreIndices(cis,&cidx);

1769:   PetscFree(ns_col);
1770:   PetscFree(permr);
1771:   ISDestroy(cis);
1772:   ISDestroy(ris);
1773:   PetscFree(tns);
1774:   return(0);
1775: }

1780: /*@C
1781:    MatInodeGetInodeSizes - Returns the inode information of the Inode matrix.

1783:    Collective on Mat

1785:    Input Parameter:
1786: .  A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ

1788:    Output Parameter:
1789: +  node_count - no of inodes present in the matrix.
1790: .  sizes      - an array of size node_count,with sizes of each inode.
1791: -  limit      - the max size used to generate the inodes.

1793:    Level: advanced

1795:    Notes: This routine returns some internal storage information
1796:    of the matrix, it is intended to be used by advanced users.
1797:    It should be called after the matrix is assembled.
1798:    The contents of the sizes[] array should not be changed.
1799:    PETSC_NULL may be passed for information not requested.

1801: .keywords: matrix, seqaij, get, inode

1803: .seealso: MatGetInfo()
1804: @*/
1805: PetscErrorCode  MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
1806: {
1807:   PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*);

1810:   if (!A->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
1811:   PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",(void (**)(void))&f);
1812:   if (f) {
1813:     (*f)(A,node_count,sizes,limit);
1814:   }
1815:   return(0);
1816: }

1821: PetscErrorCode  MatInodeGetInodeSizes_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
1822: {
1823:   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;

1826:   if (node_count) *node_count = a->inode.node_count;
1827:   if (sizes)      *sizes      = a->inode.size;
1828:   if (limit)      *limit      = a->inode.limit;
1829:   return(0);
1830: }