Actual source code: vpscat.c

  1: #define PETSCVEC_DLL

  3: /*
  4:     Defines parallel vector scatters.
  5: */

 7:  #include "include/private/isimpl.h"
 8:  #include "include/private/vecimpl.h"
 9:  #include "src/vec/vec/impls/dvecimpl.h"
 10:  #include "src/vec/vec/impls/mpi/pvecimpl.h"
 11:  #include "petscsys.h"

 15: PetscErrorCode VecScatterView_MPI(VecScatter ctx,PetscViewer viewer)
 16: {
 17:   VecScatter_MPI_General *to=(VecScatter_MPI_General*)ctx->todata;
 18:   VecScatter_MPI_General *from=(VecScatter_MPI_General*)ctx->fromdata;
 19:   PetscErrorCode         ierr;
 20:   PetscInt               i;
 21:   PetscMPIInt            rank;
 22:   PetscViewerFormat      format;
 23:   PetscTruth             iascii;

 26:   PetscTypeCompare((PetscObject)viewer,PETSC_VIEWER_ASCII,&iascii);
 27:   if (iascii) {
 28:     MPI_Comm_rank(ctx->comm,&rank);
 29:     PetscViewerGetFormat(viewer,&format);
 30:     if (format ==  PETSC_VIEWER_ASCII_INFO) {
 31:       PetscInt nsend_max,nrecv_max,lensend_max,lenrecv_max,alldata,itmp;

 33:       MPI_Reduce(&to->n,&nsend_max,1,MPIU_INT,MPI_MAX,0,ctx->comm);
 34:       MPI_Reduce(&from->n,&nrecv_max,1,MPIU_INT,MPI_MAX,0,ctx->comm);
 35:       itmp = to->starts[to->n];
 36:       MPI_Reduce(&itmp,&lensend_max,1,MPIU_INT,MPI_MAX,0,ctx->comm);
 37:       itmp = from->starts[from->n];
 38:       MPI_Reduce(&itmp,&lenrecv_max,1,MPIU_INT,MPI_MAX,0,ctx->comm);
 39:       MPI_Reduce(&itmp,&alldata,1,MPIU_INT,MPI_SUM,0,ctx->comm);

 41:       PetscViewerASCIIPrintf(viewer,"VecScatter statistics\n");
 42:       PetscViewerASCIIPrintf(viewer,"  Maximum number sends %D\n",nsend_max);
 43:       PetscViewerASCIIPrintf(viewer,"  Maximum number receives %D\n",nrecv_max);
 44:       PetscViewerASCIIPrintf(viewer,"  Maximum data sent %D\n",(int)(lensend_max*to->bs*sizeof(PetscScalar)));
 45:       PetscViewerASCIIPrintf(viewer,"  Maximum data received %D\n",(int)(lenrecv_max*to->bs*sizeof(PetscScalar)));
 46:       PetscViewerASCIIPrintf(viewer,"  Total data sent %D\n",(int)(alldata*to->bs*sizeof(PetscScalar)));

 48:     } else {
 49:       PetscViewerASCIISynchronizedPrintf(viewer,"[%d] Number sends = %D; Number to self = %D\n",rank,to->n,to->local.n);
 50:       if (to->n) {
 51:         for (i=0; i<to->n; i++){
 52:           PetscViewerASCIISynchronizedPrintf(viewer,"[%d]   %D length = %D to whom %D\n",rank,i,to->starts[i+1]-to->starts[i],to->procs[i]);
 53:         }
 54:         PetscViewerASCIISynchronizedPrintf(viewer,"Now the indices for all remote sends (in order by process sent to)\n");
 55:         for (i=0; i<to->starts[to->n]; i++){
 56:           PetscViewerASCIISynchronizedPrintf(viewer,"[%d]%D \n",rank,to->indices[i]);
 57:         }
 58:       }

 60:       PetscViewerASCIISynchronizedPrintf(viewer,"[%d]Number receives = %D; Number from self = %D\n",rank,from->n,from->local.n);
 61:       if (from->n) {
 62:         for (i=0; i<from->n; i++){
 63:           PetscViewerASCIISynchronizedPrintf(viewer,"[%d] %D length %D from whom %D\n",rank,i,from->starts[i+1]-from->starts[i],from->procs[i]);
 64:         }

 66:         PetscViewerASCIISynchronizedPrintf(viewer,"Now the indices for all remote receives (in order by process received from)\n");
 67:         for (i=0; i<from->starts[from->n]; i++){
 68:           PetscViewerASCIISynchronizedPrintf(viewer,"[%d]%D \n",rank,from->indices[i]);
 69:         }
 70:       }
 71:       if (to->local.n) {
 72:         PetscViewerASCIISynchronizedPrintf(viewer,"[%d]Indices for local part of scatter\n",rank);
 73:         for (i=0; i<to->local.n; i++){
 74:           PetscViewerASCIISynchronizedPrintf(viewer,"[%d]From %D to %D \n",rank,from->local.vslots[i],to->local.vslots[i]);
 75:         }
 76:       }

 78:       PetscViewerFlush(viewer);
 79:     }
 80:   } else {
 81:     SETERRQ1(PETSC_ERR_SUP,"Viewer type %s not supported for this scatter",((PetscObject)viewer)->type_name);
 82:   }
 83:   return(0);
 84: }
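/*
    Usage sketch (an assumption based on the PETSc 2.3-era viewer interface, which is
    defined elsewhere in the library and not in this file): the summary statistics above
    are printed only when the viewer format is PETSC_VIEWER_ASCII_INFO, e.g.

       PetscViewerSetFormat(PETSC_VIEWER_STDOUT_WORLD,PETSC_VIEWER_ASCII_INFO);
       VecScatterView(ctx,PETSC_VIEWER_STDOUT_WORLD);

    otherwise the per-process send and receive lists are printed synchronously.
*/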

 86: /* -----------------------------------------------------------------------------------*/
 87: /*
 88:       The next routine determines which part of the local scatter is an
 89:   exact copy of values into their current location. We check this here and
 90:   then know that we need not perform that portion of the scatter.
 91: */
 94: PetscErrorCode VecScatterLocalOptimize_Private(VecScatter_Seq_General *to,VecScatter_Seq_General *from)
 95: {
 96:   PetscInt       n = to->n,n_nonmatching = 0,i,*to_slots = to->vslots,*from_slots = from->vslots;
 98:   PetscInt       *nto_slots,*nfrom_slots,j = 0;
 99: 
101:   for (i=0; i<n; i++) {
102:     if (to_slots[i] != from_slots[i]) n_nonmatching++;
103:   }

105:   if (!n_nonmatching) {
106:     to->nonmatching_computed = PETSC_TRUE;
107:     to->n_nonmatching        = from->n_nonmatching = 0;
108:     PetscInfo1(0,"Reduced %D to 0\n", n);
109:   } else if (n_nonmatching == n) {
110:     to->nonmatching_computed = PETSC_FALSE;
111:     PetscInfo(0,"All values non-matching\n");
112:   } else {
113:     to->nonmatching_computed= PETSC_TRUE;
114:     to->n_nonmatching       = from->n_nonmatching = n_nonmatching;
115:     PetscMalloc(n_nonmatching*sizeof(PetscInt),&nto_slots);
116:     PetscMalloc(n_nonmatching*sizeof(PetscInt),&nfrom_slots);
117:     to->slots_nonmatching   = nto_slots;
118:     from->slots_nonmatching = nfrom_slots;
119:     for (i=0; i<n; i++) {
120:       if (to_slots[i] != from_slots[i]) {
121:         nto_slots[j]   = to_slots[i];
122:         nfrom_slots[j] = from_slots[i];
123:         j++;
124:       }
125:     }
126:     PetscInfo2(0,"Reduced %D to %D\n",n,n_nonmatching);
127:   }
128:   return(0);
129: }
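/*
    Worked example (illustrative): with to->vslots = {3,4,9} and from->vslots = {3,4,7},
    only the last pair differs, so n_nonmatching = 1, nto_slots = {9} and nfrom_slots = {7};
    the local scatter can then skip the two entries that already sit in place
    (the info message would read "Reduced 3 to 1").
*/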

131: /* --------------------------------------------------------------------------------------*/

133: /* -------------------------------------------------------------------------------------*/
136: PetscErrorCode VecScatterDestroy_PtoP(VecScatter ctx)
137: {
138:   VecScatter_MPI_General *to   = (VecScatter_MPI_General*)ctx->todata;
139:   VecScatter_MPI_General *from = (VecScatter_MPI_General*)ctx->fromdata;
140:   PetscErrorCode         ierr;
141:   PetscInt               i;

144:   if (to->use_readyreceiver) {
145:     /*
146:        Since we have already posted sends we must cancel them before freeing 
147:        the requests
148:     */
149:     for (i=0; i<from->n; i++) {
150:       MPI_Cancel(from->requests+i);
151:     }
152:     for (i=0; i<to->n; i++) {
153:       MPI_Cancel(to->rev_requests+i);
154:     }
155:   }

157: #if defined(PETSC_HAVE_MPI_ALLTOALLW)
158:   if (to->use_alltoallw) {
159:     PetscFree3(to->wcounts,to->wdispls,to->types);
160:     PetscFree3(from->wcounts,from->wdispls,from->types);
161:   }
162: #endif

164: #if defined(PETSC_HAVE_MPI_WINDOW)
165:   if (to->use_window) {
166:     MPI_Win_free(&from->window);
167:     MPI_Win_free(&to->window);
168:   }
169: #endif

171:   if (to->use_alltoallv) {
172:     PetscFree2(to->counts,to->displs);
173:     PetscFree2(from->counts,from->displs);
174:   }

176:   /* release MPI resources obtained with MPI_Send_init() and MPI_Recv_init() */
177:   /* 
 178:      IBM's PE version of MPI has a bug where freeing these requests corrupts later
 179:      message passing.
180:   */
181: #if !defined(PETSC_HAVE_BROKEN_REQUEST_FREE)
182:   if (!to->use_alltoallv) {   /* currently the to->requests etc are ALWAYS allocated even if not used */
183:     if (to->requests) {
184:       for (i=0; i<to->n; i++) {
185:         MPI_Request_free(to->requests + i);
186:       }
187:     }
188:     if (to->rev_requests) {
189:       for (i=0; i<to->n; i++) {
190:         MPI_Request_free(to->rev_requests + i);
191:       }
192:     }
193:   }
194:   /*
 195:       MPICH could not properly cancel requests; thus, with ready-receiver mode we
 196:     cannot free the requests. This may be fixed now; if not, put the following
 197:     code inside an if (!to->use_readyreceiver) { } block.
198:   */
199:   if (!to->use_alltoallv) {    /* currently the from->requests etc are ALWAYS allocated even if not used */
200:     if (from->requests) {
201:       for (i=0; i<from->n; i++) {
202:         MPI_Request_free(from->requests + i);
203:       }
204:     }

206:     if (from->rev_requests) {
207:       for (i=0; i<from->n; i++) {
208:         MPI_Request_free(from->rev_requests + i);
209:       }
210:     }
211:   }
212: #endif

214:   PetscFree(to->local.vslots);
215:   PetscFree(from->local.vslots);
216:   PetscFree2(to->counts,to->displs);
217:   PetscFree2(from->counts,from->displs);
218:   PetscFree(to->local.slots_nonmatching);
219:   PetscFree(from->local.slots_nonmatching);
220:   PetscFree(to->rev_requests);
221:   PetscFree(from->rev_requests);
222:   PetscFree(to->requests);
223:   PetscFree(from->requests);
224:   PetscFree4(to->values,to->indices,to->starts,to->procs);
225:   PetscFree2(to->sstatus,to->rstatus);
226:   PetscFree4(from->values,from->indices,from->starts,from->procs);
227:   PetscFree(from);
228:   PetscFree(to);
229:   PetscHeaderDestroy(ctx);
230:   return(0);
231: }



235: /* --------------------------------------------------------------------------------------*/
236: /*
 237:     Special optimization to see if the local part of the scatter is actually
 238:     a contiguous copy; if so, the scatter routines call PetscMemcpy() instead
 239:     of indexed loops.
240: */
243: PetscErrorCode VecScatterLocalOptimizeCopy_Private(VecScatter_Seq_General *to,VecScatter_Seq_General *from,PetscInt bs)
244: {
245:   PetscInt       n = to->n,i,*to_slots = to->vslots,*from_slots = from->vslots;
246:   PetscInt       to_start,from_start;

250:   to_start   = to_slots[0];
251:   from_start = from_slots[0];

253:   for (i=1; i<n; i++) {
254:     to_start   += bs;
255:     from_start += bs;
256:     if (to_slots[i]   != to_start)   return(0);
257:     if (from_slots[i] != from_start) return(0);
258:   }
259:   to->is_copy       = PETSC_TRUE;
260:   to->copy_start    = to_slots[0];
261:   to->copy_length   = bs*sizeof(PetscScalar)*n;
262:   from->is_copy     = PETSC_TRUE;
263:   from->copy_start  = from_slots[0];
264:   from->copy_length = bs*sizeof(PetscScalar)*n;
265:   PetscInfo(0,"Local scatter is a copy, optimizing for it\n");
266:   return(0);
267: }
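/*
    Worked example (illustrative): with bs = 2, to->vslots = {10,12,14} and
    from->vslots = {0,2,4}, consecutive slots each advance by bs, so the local part is
    one contiguous block: copy_start = 10 (resp. 0) and copy_length = 3*2*sizeof(PetscScalar) bytes.
*/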

269: /* --------------------------------------------------------------------------------------*/

273: PetscErrorCode VecScatterCopy_PtoP_X(VecScatter in,VecScatter out)
274: {
275:   VecScatter_MPI_General *in_to   = (VecScatter_MPI_General*)in->todata;
276:   VecScatter_MPI_General *in_from = (VecScatter_MPI_General*)in->fromdata,*out_to,*out_from;
277:   PetscErrorCode         ierr;
278:   PetscInt               ny,bs = in_from->bs;

281:   out->begin     = in->begin;
282:   out->end       = in->end;
283:   out->copy      = in->copy;
284:   out->destroy   = in->destroy;
285:   out->view      = in->view;

287:   /* allocate entire send scatter context */
288:   PetscNew(VecScatter_MPI_General,&out_to);
289:   PetscNew(VecScatter_MPI_General,&out_from);

291:   ny                = in_to->starts[in_to->n];
292:   out_to->n         = in_to->n;
293:   out_to->type      = in_to->type;
294:   out_to->sendfirst = in_to->sendfirst;

296:   PetscMalloc(out_to->n*sizeof(MPI_Request),&out_to->requests);
297:   PetscMalloc4(bs*ny,PetscScalar,&out_to->values,ny,PetscInt,&out_to->indices,out_to->n+1,PetscInt,&out_to->starts,
298:                       out_to->n,PetscMPIInt,&out_to->procs);
299:   PetscMalloc2(PetscMax(in_to->n,in_from->n),MPI_Status,&out_to->sstatus,PetscMax(in_to->n,in_from->n),MPI_Status,
300:                       &out_to->rstatus);
301:   PetscMemcpy(out_to->indices,in_to->indices,ny*sizeof(PetscInt));
302:   PetscMemcpy(out_to->starts,in_to->starts,(out_to->n+1)*sizeof(PetscInt));
303:   PetscMemcpy(out_to->procs,in_to->procs,(out_to->n)*sizeof(PetscMPIInt));
304: 
305:   out->todata       = (void*)out_to;
306:   out_to->local.n   = in_to->local.n;
307:   out_to->local.nonmatching_computed = PETSC_FALSE;
308:   out_to->local.n_nonmatching        = 0;
309:   out_to->local.slots_nonmatching    = 0;
310:   if (in_to->local.n) {
311:     PetscMalloc(in_to->local.n*sizeof(PetscInt),&out_to->local.vslots);
312:     PetscMalloc(in_from->local.n*sizeof(PetscInt),&out_from->local.vslots);
313:     PetscMemcpy(out_to->local.vslots,in_to->local.vslots,in_to->local.n*sizeof(PetscInt));
314:     PetscMemcpy(out_from->local.vslots,in_from->local.vslots,in_from->local.n*sizeof(PetscInt));
315:   } else {
316:     out_to->local.vslots   = 0;
317:     out_from->local.vslots = 0;
318:   }

320:   /* allocate entire receive context */
321:   out_from->type      = in_from->type;
322:   ny                  = in_from->starts[in_from->n];
323:   out_from->n         = in_from->n;
324:   out_from->sendfirst = in_from->sendfirst;

326:   PetscMalloc(out_from->n*sizeof(MPI_Request),&out_from->requests);
327:   PetscMalloc4(ny*bs,PetscScalar,&out_from->values,ny,PetscInt,&out_from->indices,
328:                       out_from->n+1,PetscInt,&out_from->starts,out_from->n,PetscMPIInt,&out_from->procs);
329:   PetscMemcpy(out_from->indices,in_from->indices,ny*sizeof(PetscInt));
330:   PetscMemcpy(out_from->starts,in_from->starts,(out_from->n+1)*sizeof(PetscInt));
331:   PetscMemcpy(out_from->procs,in_from->procs,(out_from->n)*sizeof(PetscMPIInt));
332:   out->fromdata       = (void*)out_from;
333:   out_from->local.n   = in_from->local.n;
334:   out_from->local.nonmatching_computed = PETSC_FALSE;
335:   out_from->local.n_nonmatching        = 0;
336:   out_from->local.slots_nonmatching    = 0;

338:   /* 
 339:       set up the persistent request arrays for use with MPI_Send_init() and MPI_Recv_init()
340:   */
341:   {
342:     PetscMPIInt tag;
343:     MPI_Comm    comm;
344:     PetscInt    *sstarts = out_to->starts,  *rstarts = out_from->starts;
345:     PetscMPIInt *sprocs  = out_to->procs,   *rprocs  = out_from->procs;
346:     PetscInt    i;
347:     PetscTruth  flg;
348:     MPI_Request *swaits  = out_to->requests,*rwaits  = out_from->requests;
349:     MPI_Request *rev_swaits,*rev_rwaits;
350:     PetscScalar *Ssvalues = out_to->values, *Srvalues = out_from->values;

352:     PetscMalloc(in_to->n*sizeof(MPI_Request),&out_to->rev_requests);
353:     PetscMalloc(in_from->n*sizeof(MPI_Request),&out_from->rev_requests);

355:     rev_rwaits = out_to->rev_requests;
356:     rev_swaits = out_from->rev_requests;

358:     out_from->bs = out_to->bs = bs;
359:     tag     = out->tag;
360:     comm    = out->comm;

362:     /* Register the receives that you will use later (sends for scatter reverse) */
363:     for (i=0; i<out_from->n; i++) {
364:       MPI_Recv_init(Srvalues+bs*rstarts[i],bs*rstarts[i+1]-bs*rstarts[i],MPIU_SCALAR,rprocs[i],tag,comm,rwaits+i);
365:       MPI_Send_init(Srvalues+bs*rstarts[i],bs*rstarts[i+1]-bs*rstarts[i],MPIU_SCALAR,rprocs[i],tag,comm,rev_swaits+i);
366:     }

368:     PetscOptionsHasName(PETSC_NULL,"-vecscatter_rsend",&flg);
369:     if (flg) {
370:       out_to->use_readyreceiver    = PETSC_TRUE;
371:       out_from->use_readyreceiver  = PETSC_TRUE;
372:       for (i=0; i<out_to->n; i++) {
373:         MPI_Rsend_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,comm,swaits+i);
374:       }
375:       if (out_from->n) {MPI_Startall_irecv(out_from->starts[out_from->n]*out_from->bs,out_from->n,out_from->requests);}
376:       MPI_Barrier(comm);
377:       PetscInfo(0,"Using VecScatter ready receiver mode\n");
378:     } else {
379:       out_to->use_readyreceiver    = PETSC_FALSE;
380:       out_from->use_readyreceiver  = PETSC_FALSE;
381:       PetscOptionsHasName(PETSC_NULL,"-vecscatter_ssend",&flg);
382:       if (flg) {
383:         PetscInfo(0,"Using VecScatter Ssend mode\n");
384:       }
385:       for (i=0; i<out_to->n; i++) {
386:         if (!flg) {
387:           MPI_Send_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,comm,swaits+i);
388:         } else {
389:           MPI_Ssend_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,comm,swaits+i);
390:         }
391:       }
392:     }
393:     /* Register receives for scatter reverse */
394:     for (i=0; i<out_to->n; i++) {
395:       MPI_Recv_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,comm,rev_rwaits+i);
396:     }
397:   }

399:   return(0);
400: }
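/*
    Sketch of how the persistent requests registered above are reused on every scatter
    (assumed usage based on standard MPI persistent-request semantics; the actual driver
    lives in the VecScatterBegin/End routines generated from vpscat.h, which are not
    shown in this file):

       MPI_Startall(out_from->n,out_from->requests);        start the receives
       (pack the send buffers out_to->values)
       MPI_Startall(out_to->n,out_to->requests);            start the sends
       ...
       MPI_Waitall(out_from->n,out_from->requests,out_to->rstatus);
       MPI_Waitall(out_to->n,out_to->requests,out_to->sstatus);

    For SCATTER_REVERSE the rev_requests play the same roles with the send and receive
    directions swapped.
*/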
401: /* --------------------------------------------------------------------------------------------------
 402:     Packs the message data into send buffers and unpacks it from receive buffers.

404:     These could be generated automatically. 

406:     Fortran kernels etc. could be used.
407: */
408: PETSC_STATIC_INLINE void Pack_1(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y)
409: {
410:   PetscInt i;
411:   for (i=0; i<n; i++) {
412:     y[i]  = x[indicesx[i]];
413:   }
414: }
415: PETSC_STATIC_INLINE void UnPack_1(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
416: {
417:   PetscInt i;
418:   switch (addv) {
419:   case INSERT_VALUES:
420:     for (i=0; i<n; i++) {
421:       y[indicesy[i]] = x[i];
422:     }
423:     break;
424:   case ADD_VALUES:
425:     for (i=0; i<n; i++) {
426:       y[indicesy[i]] += x[i];
427:     }
428:     break;
429: #if !defined(PETSC_USE_COMPLEX)
430:   case MAX_VALUES:
431:     for (i=0; i<n; i++) {
432:       y[indicesy[i]] = PetscMax(y[indicesy[i]],x[i]);
433:     }
434: #else
435:   case MAX_VALUES:
436: #endif
437:   case NOT_SET_VALUES:
438:     break;
439:   }
440: }

442: PETSC_STATIC_INLINE void Scatter_1(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
443: {
444:   PetscInt i;
445:   switch (addv) {
446:   case INSERT_VALUES:
447:     for (i=0; i<n; i++) {
448:       y[indicesy[i]] = x[indicesx[i]];
449:     }
450:     break;
451:   case ADD_VALUES:
452:     for (i=0; i<n; i++) {
453:       y[indicesy[i]] += x[indicesx[i]];
454:     }
455:     break;
456: #if !defined(PETSC_USE_COMPLEX)
457:   case MAX_VALUES:
458:     for (i=0; i<n; i++) {
459:       y[indicesy[i]] = PetscMax(y[indicesy[i]],x[indicesx[i]]);
460:     }
461: #else
462:   case MAX_VALUES:
463: #endif
464:   case NOT_SET_VALUES:
465:     break;
466:   }
467: }
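/*
    Worked example (illustrative): with n = 3, indicesx = {4,0,2} and x = {a,b,c,d,e},
    Pack_1 fills the contiguous send buffer y = {e,a,c}.  On the receiving side, UnPack_1
    with indicesy = {1,3,5} and INSERT_VALUES writes those three values into y[1], y[3]
    and y[5]; with ADD_VALUES it accumulates into the same locations, and with MAX_VALUES
    (real scalars only) it keeps the componentwise maximum.  Scatter_1 combines both steps
    for the purely local part, copying x[indicesx[i]] directly into y[indicesy[i]].
*/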

469:   /* ----------------------------------------------------------------------------------------------- */
470: PETSC_STATIC_INLINE void Pack_2(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y)
471: {
472:   PetscInt i,idx;

474:   for (i=0; i<n; i++) {
475:     idx   = *indicesx++;
476:     y[0]  = x[idx];
477:     y[1]  = x[idx+1];
478:     y    += 2;
479:   }
480: }
481: PETSC_STATIC_INLINE void UnPack_2(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
482: {
483:   PetscInt i,idy;

485:   switch (addv) {
486:   case INSERT_VALUES:
487:     for (i=0; i<n; i++) {
488:       idy       = *indicesy++;
489:       y[idy]    = x[0];
490:       y[idy+1]  = x[1];
491:       x    += 2;
492:     }
493:     break;
494:   case ADD_VALUES:
495:     for (i=0; i<n; i++) {
496:       idy       = *indicesy++;
497:       y[idy]    += x[0];
498:       y[idy+1]  += x[1];
499:       x    += 2;
500:     }
501:     break;
502: #if !defined(PETSC_USE_COMPLEX)
503:   case MAX_VALUES:
504:     for (i=0; i<n; i++) {
505:       idy       = *indicesy++;
506:       y[idy]    = PetscMax(y[idy],x[0]);
507:       y[idy+1]  = PetscMax(y[idy+1],x[1]);
508:       x    += 2;
509:     }
510: #else
511:   case MAX_VALUES:
512: #endif
513:   case NOT_SET_VALUES:
514:     break;
515:   }
516: }

518: PETSC_STATIC_INLINE void Scatter_2(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
519: {
520:   PetscInt i,idx,idy;

522:   switch (addv) {
523:   case INSERT_VALUES:
524:     for (i=0; i<n; i++) {
525:       idx       = *indicesx++;
526:       idy       = *indicesy++;
527:       y[idy]    = x[idx];
528:       y[idy+1]  = x[idx+1];
529:     }
530:     break;
531:   case ADD_VALUES:
532:     for (i=0; i<n; i++) {
533:       idx       = *indicesx++;
534:       idy       = *indicesy++;
535:       y[idy]    += x[idx];
536:       y[idy+1]  += x[idx+1];
537:     }
538:     break;
539: #if !defined(PETSC_USE_COMPLEX)
540:   case MAX_VALUES:
541:     for (i=0; i<n; i++) {
542:       idx       = *indicesx++;
543:       idy       = *indicesy++;
544:       y[idy]    = PetscMax(y[idy],x[idx]);
545:       y[idy+1]  = PetscMax(y[idy+1],x[idx+1]);
546:     }
547: #else
548:   case MAX_VALUES:
549: #endif
550:   case NOT_SET_VALUES:
551:     break;
552:   }
553: }
554:   /* ----------------------------------------------------------------------------------------------- */
555: PETSC_STATIC_INLINE void Pack_3(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y)
556: {
557:   PetscInt i,idx;

559:   for (i=0; i<n; i++) {
560:     idx   = *indicesx++;
561:     y[0]  = x[idx];
562:     y[1]  = x[idx+1];
563:     y[2]  = x[idx+2];
564:     y    += 3;
565:   }
566: }
567: PETSC_STATIC_INLINE void UnPack_3(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
568: {
569:   PetscInt i,idy;

571:   switch (addv) {
572:   case INSERT_VALUES:
573:     for (i=0; i<n; i++) {
574:       idy       = *indicesy++;
575:       y[idy]    = x[0];
576:       y[idy+1]  = x[1];
577:       y[idy+2]  = x[2];
578:       x    += 3;
579:     }
580:     break;
581:   case ADD_VALUES:
582:     for (i=0; i<n; i++) {
583:       idy       = *indicesy++;
584:       y[idy]    += x[0];
585:       y[idy+1]  += x[1];
586:       y[idy+2]  += x[2];
587:       x    += 3;
588:     }
589:     break;
590: #if !defined(PETSC_USE_COMPLEX)
591:   case MAX_VALUES:
592:     for (i=0; i<n; i++) {
593:       idy       = *indicesy++;
594:       y[idy]    = PetscMax(y[idy],x[0]);
595:       y[idy+1]  = PetscMax(y[idy+1],x[1]);
596:       y[idy+2]  = PetscMax(y[idy+2],x[2]);
597:       x    += 3;
598:     }
599: #else
600:   case MAX_VALUES:
601: #endif
602:   case NOT_SET_VALUES:
603:     break;
604:   }
605: }

607: PETSC_STATIC_INLINE void Scatter_3(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
608: {
609:   PetscInt i,idx,idy;

611:   switch (addv) {
612:   case INSERT_VALUES:
613:     for (i=0; i<n; i++) {
614:       idx       = *indicesx++;
615:       idy       = *indicesy++;
616:       y[idy]    = x[idx];
617:       y[idy+1]  = x[idx+1];
618:       y[idy+2]  = x[idx+2];
619:     }
620:     break;
621:   case ADD_VALUES:
622:     for (i=0; i<n; i++) {
623:       idx       = *indicesx++;
624:       idy       = *indicesy++;
625:       y[idy]    += x[idx];
626:       y[idy+1]  += x[idx+1];
627:       y[idy+2]  += x[idx+2];
628:     }
629:     break;
630: #if !defined(PETSC_USE_COMPLEX)
631:   case MAX_VALUES:
632:     for (i=0; i<n; i++) {
633:       idx       = *indicesx++;
634:       idy       = *indicesy++;
635:       y[idy]    = PetscMax(y[idy],x[idx]);
636:       y[idy+1]  = PetscMax(y[idy+1],x[idx+1]);
637:       y[idy+2]  = PetscMax(y[idy+2],x[idx+2]);
638:     }
639: #else
640:   case MAX_VALUES:
641: #endif
642:   case NOT_SET_VALUES:
643:     break;
644:   }
645: }
646:   /* ----------------------------------------------------------------------------------------------- */
647: PETSC_STATIC_INLINE void Pack_4(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y)
648: {
649:   PetscInt i,idx;

651:   for (i=0; i<n; i++) {
652:     idx   = *indicesx++;
653:     y[0]  = x[idx];
654:     y[1]  = x[idx+1];
655:     y[2]  = x[idx+2];
656:     y[3]  = x[idx+3];
657:     y    += 4;
658:   }
659: }
660: PETSC_STATIC_INLINE void UnPack_4(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
661: {
662:   PetscInt i,idy;

664:   switch (addv) {
665:   case INSERT_VALUES:
666:     for (i=0; i<n; i++) {
667:       idy       = *indicesy++;
668:       y[idy]    = x[0];
669:       y[idy+1]  = x[1];
670:       y[idy+2]  = x[2];
671:       y[idy+3]  = x[3];
672:       x    += 4;
673:     }
674:     break;
675:   case ADD_VALUES:
676:     for (i=0; i<n; i++) {
677:       idy       = *indicesy++;
678:       y[idy]    += x[0];
679:       y[idy+1]  += x[1];
680:       y[idy+2]  += x[2];
681:       y[idy+3]  += x[3];
682:       x    += 4;
683:     }
684:     break;
685: #if !defined(PETSC_USE_COMPLEX)
686:   case MAX_VALUES:
687:     for (i=0; i<n; i++) {
688:       idy       = *indicesy++;
689:       y[idy]    = PetscMax(y[idy],x[0]);
690:       y[idy+1]  = PetscMax(y[idy+1],x[1]);
691:       y[idy+2]  = PetscMax(y[idy+2],x[2]);
692:       y[idy+3]  = PetscMax(y[idy+3],x[3]);
693:       x    += 4;
694:     }
695: #else
696:   case MAX_VALUES:
697: #endif
698:   case NOT_SET_VALUES:
699:     break;
700:   }
701: }

703: PETSC_STATIC_INLINE void Scatter_4(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
704: {
705:   PetscInt i,idx,idy;

707:   switch (addv) {
708:   case INSERT_VALUES:
709:     for (i=0; i<n; i++) {
710:       idx       = *indicesx++;
711:       idy       = *indicesy++;
712:       y[idy]    = x[idx];
713:       y[idy+1]  = x[idx+1];
714:       y[idy+2]  = x[idx+2];
715:       y[idy+3]  = x[idx+3];
716:     }
717:     break;
718:   case ADD_VALUES:
719:     for (i=0; i<n; i++) {
720:       idx       = *indicesx++;
721:       idy       = *indicesy++;
722:       y[idy]    += x[idx];
723:       y[idy+1]  += x[idx+1];
724:       y[idy+2]  += x[idx+2];
725:       y[idy+3]  += x[idx+3];
726:     }
727:     break;
728: #if !defined(PETSC_USE_COMPLEX)
729:   case MAX_VALUES:
730:     for (i=0; i<n; i++) {
731:       idx       = *indicesx++;
732:       idy       = *indicesy++;
733:       y[idy]    = PetscMax(y[idy],x[idx]);
734:       y[idy+1]  = PetscMax(y[idy+1],x[idx+1]);
735:       y[idy+2]  = PetscMax(y[idy+2],x[idx+2]);
736:       y[idy+3]  = PetscMax(y[idy+3],x[idx+3]);
737:     }
738: #else
739:   case MAX_VALUES:
740: #endif
741:   case NOT_SET_VALUES:
742:     break;
743:   }
744: }
745:   /* ----------------------------------------------------------------------------------------------- */
746: PETSC_STATIC_INLINE void Pack_5(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y)
747: {
748:   PetscInt i,idx;

750:   for (i=0; i<n; i++) {
751:     idx   = *indicesx++;
752:     y[0]  = x[idx];
753:     y[1]  = x[idx+1];
754:     y[2]  = x[idx+2];
755:     y[3]  = x[idx+3];
756:     y[4]  = x[idx+4];
757:     y    += 5;
758:   }
759: }
760: PETSC_STATIC_INLINE void UnPack_5(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
761: {
762:   PetscInt i,idy;

764:   switch (addv) {
765:   case INSERT_VALUES:
766:     for (i=0; i<n; i++) {
767:       idy       = *indicesy++;
768:       y[idy]    = x[0];
769:       y[idy+1]  = x[1];
770:       y[idy+2]  = x[2];
771:       y[idy+3]  = x[3];
772:       y[idy+4]  = x[4];
773:       x    += 5;
774:     }
775:     break;
776:   case ADD_VALUES:
777:     for (i=0; i<n; i++) {
778:       idy       = *indicesy++;
779:       y[idy]    += x[0];
780:       y[idy+1]  += x[1];
781:       y[idy+2]  += x[2];
782:       y[idy+3]  += x[3];
783:       y[idy+4]  += x[4];
784:       x    += 5;
785:     }
786:     break;
787: #if !defined(PETSC_USE_COMPLEX)
788:   case MAX_VALUES:
789:     for (i=0; i<n; i++) {
790:       idy       = *indicesy++;
791:       y[idy]    = PetscMax(y[idy],x[0]);
792:       y[idy+1]  = PetscMax(y[idy+1],x[1]);
793:       y[idy+2]  = PetscMax(y[idy+2],x[2]);
794:       y[idy+3]  = PetscMax(y[idy+3],x[3]);
795:       y[idy+4]  = PetscMax(y[idy+4],x[4]);
796:       x    += 5;
797:     }
798: #else
799:   case MAX_VALUES:
800: #endif
801:   case NOT_SET_VALUES:
802:     break;
803:   }
804: }

806: PETSC_STATIC_INLINE void Scatter_5(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
807: {
808:   PetscInt i,idx,idy;

810:   switch (addv) {
811:   case INSERT_VALUES:
812:     for (i=0; i<n; i++) {
813:       idx       = *indicesx++;
814:       idy       = *indicesy++;
815:       y[idy]    = x[idx];
816:       y[idy+1]  = x[idx+1];
817:       y[idy+2]  = x[idx+2];
818:       y[idy+3]  = x[idx+3];
819:       y[idy+4]  = x[idx+4];
820:     }
821:     break;
822:   case ADD_VALUES:
823:     for (i=0; i<n; i++) {
824:       idx       = *indicesx++;
825:       idy       = *indicesy++;
826:       y[idy]    += x[idx];
827:       y[idy+1]  += x[idx+1];
828:       y[idy+2]  += x[idx+2];
829:       y[idy+3]  += x[idx+3];
830:       y[idy+4]  += x[idx+4];
831:     }
832:     break;
833: #if !defined(PETSC_USE_COMPLEX)
834:   case MAX_VALUES:
835:     for (i=0; i<n; i++) {
836:       idx       = *indicesx++;
837:       idy       = *indicesy++;
838:       y[idy]    = PetscMax(y[idy],x[idx]);
839:       y[idy+1]  = PetscMax(y[idy+1],x[idx+1]);
840:       y[idy+2]  = PetscMax(y[idy+2],x[idx+2]);
841:       y[idy+3]  = PetscMax(y[idy+3],x[idx+3]);
842:       y[idy+4]  = PetscMax(y[idy+4],x[idx+4]);
843:     }
844: #else
845:   case MAX_VALUES:
846: #endif
847:   case NOT_SET_VALUES:
848:     break;
849:   }
850: }
851:   /* ----------------------------------------------------------------------------------------------- */
852: PETSC_STATIC_INLINE void Pack_6(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y)
853: {
854:   PetscInt i,idx;

856:   for (i=0; i<n; i++) {
857:     idx   = *indicesx++;
858:     y[0]  = x[idx];
859:     y[1]  = x[idx+1];
860:     y[2]  = x[idx+2];
861:     y[3]  = x[idx+3];
862:     y[4]  = x[idx+4];
863:     y[5]  = x[idx+5];
864:     y    += 6;
865:   }
866: }
867: PETSC_STATIC_INLINE void UnPack_6(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
868: {
869:   PetscInt i,idy;

871:   switch (addv) {
872:   case INSERT_VALUES:
873:     for (i=0; i<n; i++) {
874:       idy       = *indicesy++;
875:       y[idy]    = x[0];
876:       y[idy+1]  = x[1];
877:       y[idy+2]  = x[2];
878:       y[idy+3]  = x[3];
879:       y[idy+4]  = x[4];
880:       y[idy+5]  = x[5];
881:       x    += 6;
882:     }
883:     break;
884:   case ADD_VALUES:
885:     for (i=0; i<n; i++) {
886:       idy       = *indicesy++;
887:       y[idy]    += x[0];
888:       y[idy+1]  += x[1];
889:       y[idy+2]  += x[2];
890:       y[idy+3]  += x[3];
891:       y[idy+4]  += x[4];
892:       y[idy+5]  += x[5];
893:       x    += 6;
894:     }
895:     break;
896: #if !defined(PETSC_USE_COMPLEX)
897:   case MAX_VALUES:
898:     for (i=0; i<n; i++) {
899:       idy       = *indicesy++;
900:       y[idy]    = PetscMax(y[idy],x[0]);
901:       y[idy+1]  = PetscMax(y[idy+1],x[1]);
902:       y[idy+2]  = PetscMax(y[idy+2],x[2]);
903:       y[idy+3]  = PetscMax(y[idy+3],x[3]);
904:       y[idy+4]  = PetscMax(y[idy+4],x[4]);
905:       y[idy+5]  = PetscMax(y[idy+5],x[5]);
906:       x    += 6;
907:     }
908: #else
909:   case MAX_VALUES:
910: #endif
911:   case NOT_SET_VALUES:
912:     break;
913:   }
914: }

916: PETSC_STATIC_INLINE void Scatter_6(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
917: {
918:   PetscInt i,idx,idy;

920:   switch (addv) {
921:   case INSERT_VALUES:
922:     for (i=0; i<n; i++) {
923:       idx       = *indicesx++;
924:       idy       = *indicesy++;
925:       y[idy]    = x[idx];
926:       y[idy+1]  = x[idx+1];
927:       y[idy+2]  = x[idx+2];
928:       y[idy+3]  = x[idx+3];
929:       y[idy+4]  = x[idx+4];
930:       y[idy+5]  = x[idx+5];
931:     }
932:     break;
933:   case ADD_VALUES:
934:     for (i=0; i<n; i++) {
935:       idx       = *indicesx++;
936:       idy       = *indicesy++;
937:       y[idy]    += x[idx];
938:       y[idy+1]  += x[idx+1];
939:       y[idy+2]  += x[idx+2];
940:       y[idy+3]  += x[idx+3];
941:       y[idy+4]  += x[idx+4];
942:       y[idy+5]  += x[idx+5];
943:     }
944:     break;
945: #if !defined(PETSC_USE_COMPLEX)
946:   case MAX_VALUES:
947:     for (i=0; i<n; i++) {
948:       idx       = *indicesx++;
949:       idy       = *indicesy++;
950:       y[idy]    = PetscMax(y[idy],x[idx]);
951:       y[idy+1]  = PetscMax(y[idy+1],x[idx+1]);
952:       y[idy+2]  = PetscMax(y[idy+2],x[idx+2]);
953:       y[idy+3]  = PetscMax(y[idy+3],x[idx+3]);
954:       y[idy+4]  = PetscMax(y[idy+4],x[idx+4]);
955:       y[idy+5]  = PetscMax(y[idy+5],x[idx+5]);
956:     }
957: #else
958:   case MAX_VALUES:
959: #endif
960:   case NOT_SET_VALUES:
961:     break;
962:   }
963: }
964:   /* ----------------------------------------------------------------------------------------------- */
965: PETSC_STATIC_INLINE void Pack_7(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y)
966: {
967:   PetscInt i,idx;

969:   for (i=0; i<n; i++) {
970:     idx   = *indicesx++;
971:     y[0]  = x[idx];
972:     y[1]  = x[idx+1];
973:     y[2]  = x[idx+2];
974:     y[3]  = x[idx+3];
975:     y[4]  = x[idx+4];
976:     y[5]  = x[idx+5];
977:     y[6]  = x[idx+6];
978:     y    += 7;
979:   }
980: }
981: PETSC_STATIC_INLINE void UnPack_7(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
982: {
983:   PetscInt i,idy;

985:   switch (addv) {
986:   case INSERT_VALUES:
987:     for (i=0; i<n; i++) {
988:       idy       = *indicesy++;
989:       y[idy]    = x[0];
990:       y[idy+1]  = x[1];
991:       y[idy+2]  = x[2];
992:       y[idy+3]  = x[3];
993:       y[idy+4]  = x[4];
994:       y[idy+5]  = x[5];
995:       y[idy+6]  = x[6];
996:       x    += 7;
997:     }
998:     break;
999:   case ADD_VALUES:
1000:     for (i=0; i<n; i++) {
1001:       idy       = *indicesy++;
1002:       y[idy]    += x[0];
1003:       y[idy+1]  += x[1];
1004:       y[idy+2]  += x[2];
1005:       y[idy+3]  += x[3];
1006:       y[idy+4]  += x[4];
1007:       y[idy+5]  += x[5];
1008:       y[idy+6]  += x[6];
1009:       x    += 7;
1010:     }
1011:     break;
1012: #if !defined(PETSC_USE_COMPLEX)
1013:   case MAX_VALUES:
1014:     for (i=0; i<n; i++) {
1015:       idy       = *indicesy++;
1016:       y[idy]    = PetscMax(y[idy],x[0]);
1017:       y[idy+1]  = PetscMax(y[idy+1],x[1]);
1018:       y[idy+2]  = PetscMax(y[idy+2],x[2]);
1019:       y[idy+3]  = PetscMax(y[idy+3],x[3]);
1020:       y[idy+4]  = PetscMax(y[idy+4],x[4]);
1021:       y[idy+5]  = PetscMax(y[idy+5],x[5]);
1022:       y[idy+6]  = PetscMax(y[idy+6],x[6]);
1023:       x    += 7;
1024:     }
1025: #else
1026:   case MAX_VALUES:
1027: #endif
1028:   case NOT_SET_VALUES:
1029:     break;
1030:   }
1031: }

1033: PETSC_STATIC_INLINE void Scatter_7(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
1034: {
1035:   PetscInt i,idx,idy;

1037:   switch (addv) {
1038:   case INSERT_VALUES:
1039:     for (i=0; i<n; i++) {
1040:       idx       = *indicesx++;
1041:       idy       = *indicesy++;
1042:       y[idy]    = x[idx];
1043:       y[idy+1]  = x[idx+1];
1044:       y[idy+2]  = x[idx+2];
1045:       y[idy+3]  = x[idx+3];
1046:       y[idy+4]  = x[idx+4];
1047:       y[idy+5]  = x[idx+5];
1048:       y[idy+6]  = x[idx+6];
1049:     }
1050:     break;
1051:   case ADD_VALUES:
1052:     for (i=0; i<n; i++) {
1053:       idx       = *indicesx++;
1054:       idy       = *indicesy++;
1055:       y[idy]    += x[idx];
1056:       y[idy+1]  += x[idx+1];
1057:       y[idy+2]  += x[idx+2];
1058:       y[idy+3]  += x[idx+3];
1059:       y[idy+4]  += x[idx+4];
1060:       y[idy+5]  += x[idx+5];
1061:       y[idy+6]  += x[idx+6];
1062:     }
1063:     break;
1064: #if !defined(PETSC_USE_COMPLEX)
1065:   case MAX_VALUES:
1066:     for (i=0; i<n; i++) {
1067:       idx       = *indicesx++;
1068:       idy       = *indicesy++;
1069:       y[idy]    = PetscMax(y[idy],x[idx]);
1070:       y[idy+1]  = PetscMax(y[idy+1],x[idx+1]);
1071:       y[idy+2]  = PetscMax(y[idy+2],x[idx+2]);
1072:       y[idy+3]  = PetscMax(y[idy+3],x[idx+3]);
1073:       y[idy+4]  = PetscMax(y[idy+4],x[idx+4]);
1074:       y[idy+5]  = PetscMax(y[idy+5],x[idx+5]);
1075:       y[idy+6]  = PetscMax(y[idy+6],x[idx+6]);
1076:     }
1077: #else
1078:   case MAX_VALUES:
1079: #endif
1080:   case NOT_SET_VALUES:
1081:     break;
1082:   }
1083: }
1084:   /* ----------------------------------------------------------------------------------------------- */
1085: PETSC_STATIC_INLINE void Pack_8(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y)
1086: {
1087:   PetscInt i,idx;

1089:   for (i=0; i<n; i++) {
1090:     idx   = *indicesx++;
1091:     y[0]  = x[idx];
1092:     y[1]  = x[idx+1];
1093:     y[2]  = x[idx+2];
1094:     y[3]  = x[idx+3];
1095:     y[4]  = x[idx+4];
1096:     y[5]  = x[idx+5];
1097:     y[6]  = x[idx+6];
1098:     y[7]  = x[idx+7];
1099:     y    += 8;
1100:   }
1101: }
1102: PETSC_STATIC_INLINE void UnPack_8(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
1103: {
1104:   PetscInt i,idy;

1106:   switch (addv) {
1107:   case INSERT_VALUES:
1108:     for (i=0; i<n; i++) {
1109:       idy       = *indicesy++;
1110:       y[idy]    = x[0];
1111:       y[idy+1]  = x[1];
1112:       y[idy+2]  = x[2];
1113:       y[idy+3]  = x[3];
1114:       y[idy+4]  = x[4];
1115:       y[idy+5]  = x[5];
1116:       y[idy+6]  = x[6];
1117:       y[idy+7]  = x[7];
1118:       x    += 8;
1119:     }
1120:     break;
1121:   case ADD_VALUES:
1122:     for (i=0; i<n; i++) {
1123:       idy       = *indicesy++;
1124:       y[idy]    += x[0];
1125:       y[idy+1]  += x[1];
1126:       y[idy+2]  += x[2];
1127:       y[idy+3]  += x[3];
1128:       y[idy+4]  += x[4];
1129:       y[idy+5]  += x[5];
1130:       y[idy+6]  += x[6];
1131:       y[idy+7]  += x[7];
1132:       x    += 8;
1133:     }
1134:     break;
1135: #if !defined(PETSC_USE_COMPLEX)
1136:   case MAX_VALUES:
1137:     for (i=0; i<n; i++) {
1138:       idy       = *indicesy++;
1139:       y[idy]    = PetscMax(y[idy],x[0]);
1140:       y[idy+1]  = PetscMax(y[idy+1],x[1]);
1141:       y[idy+2]  = PetscMax(y[idy+2],x[2]);
1142:       y[idy+3]  = PetscMax(y[idy+3],x[3]);
1143:       y[idy+4]  = PetscMax(y[idy+4],x[4]);
1144:       y[idy+5]  = PetscMax(y[idy+5],x[5]);
1145:       y[idy+6]  = PetscMax(y[idy+6],x[6]);
1146:       y[idy+7]  = PetscMax(y[idy+7],x[7]);
1147:       x    += 8;
1148:     }
1149: #else
1150:   case MAX_VALUES:
1151: #endif
1152:   case NOT_SET_VALUES:
1153:     break;
1154:   }
1155: }

1157: PETSC_STATIC_INLINE void Scatter_8(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
1158: {
1159:   PetscInt i,idx,idy;

1161:   switch (addv) {
1162:   case INSERT_VALUES:
1163:     for (i=0; i<n; i++) {
1164:       idx       = *indicesx++;
1165:       idy       = *indicesy++;
1166:       y[idy]    = x[idx];
1167:       y[idy+1]  = x[idx+1];
1168:       y[idy+2]  = x[idx+2];
1169:       y[idy+3]  = x[idx+3];
1170:       y[idy+4]  = x[idx+4];
1171:       y[idy+5]  = x[idx+5];
1172:       y[idy+6]  = x[idx+6];
1173:       y[idy+7]  = x[idx+7];
1174:     }
1175:     break;
1176:   case ADD_VALUES:
1177:     for (i=0; i<n; i++) {
1178:       idx       = *indicesx++;
1179:       idy       = *indicesy++;
1180:       y[idy]    += x[idx];
1181:       y[idy+1]  += x[idx+1];
1182:       y[idy+2]  += x[idx+2];
1183:       y[idy+3]  += x[idx+3];
1184:       y[idy+4]  += x[idx+4];
1185:       y[idy+5]  += x[idx+5];
1186:       y[idy+6]  += x[idx+6];
1187:       y[idy+7]  += x[idx+7];
1188:     }
1189:     break;
1190: #if !defined(PETSC_USE_COMPLEX)
1191:   case MAX_VALUES:
1192:     for (i=0; i<n; i++) {
1193:       idx       = *indicesx++;
1194:       idy       = *indicesy++;
1195:       y[idy]    = PetscMax(y[idy],x[idx]);
1196:       y[idy+1]  = PetscMax(y[idy+1],x[idx+1]);
1197:       y[idy+2]  = PetscMax(y[idy+2],x[idx+2]);
1198:       y[idy+3]  = PetscMax(y[idy+3],x[idx+3]);
1199:       y[idy+4]  = PetscMax(y[idy+4],x[idx+4]);
1200:       y[idy+5]  = PetscMax(y[idy+5],x[idx+5]);
1201:       y[idy+6]  = PetscMax(y[idy+6],x[idx+6]);
1202:       y[idy+7]  = PetscMax(y[idy+7],x[idx+7]);
1203:     }
1204: #else
1205:   case MAX_VALUES:
1206: #endif
1207:   case NOT_SET_VALUES:
1208:     break;
1209:   }
1210: }

1212:   /* ----------------------------------------------------------------------------------------------- */
1213: PETSC_STATIC_INLINE void Pack_12(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y)
1214: {
1215:   PetscInt i,idx;

1217:   for (i=0; i<n; i++) {
1218:     idx   = *indicesx++;
1219:     y[0]  = x[idx];
1220:     y[1]  = x[idx+1];
1221:     y[2]  = x[idx+2];
1222:     y[3]  = x[idx+3];
1223:     y[4]  = x[idx+4];
1224:     y[5]  = x[idx+5];
1225:     y[6]  = x[idx+6];
1226:     y[7]  = x[idx+7];
1227:     y[8]  = x[idx+8];
1228:     y[9]  = x[idx+9];
1229:     y[10] = x[idx+10];
1230:     y[11] = x[idx+11];
1231:     y    += 12;
1232:   }
1233: }
1234: PETSC_STATIC_INLINE void UnPack_12(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
1235: {
1236:   PetscInt i,idy;

1238:   switch (addv) {
1239:   case INSERT_VALUES:
1240:     for (i=0; i<n; i++) {
1241:       idy       = *indicesy++;
1242:       y[idy]    = x[0];
1243:       y[idy+1]  = x[1];
1244:       y[idy+2]  = x[2];
1245:       y[idy+3]  = x[3];
1246:       y[idy+4]  = x[4];
1247:       y[idy+5]  = x[5];
1248:       y[idy+6]  = x[6];
1249:       y[idy+7]  = x[7];
1250:       y[idy+8]  = x[8];
1251:       y[idy+9]  = x[9];
1252:       y[idy+10] = x[10];
1253:       y[idy+11] = x[11];
1254:       x    += 12;
1255:     }
1256:     break;
1257:   case ADD_VALUES:
1258:     for (i=0; i<n; i++) {
1259:       idy       = *indicesy++;
1260:       y[idy]    += x[0];
1261:       y[idy+1]  += x[1];
1262:       y[idy+2]  += x[2];
1263:       y[idy+3]  += x[3];
1264:       y[idy+4]  += x[4];
1265:       y[idy+5]  += x[5];
1266:       y[idy+6]  += x[6];
1267:       y[idy+7]  += x[7];
1268:       y[idy+8]  += x[8];
1269:       y[idy+9]  += x[9];
1270:       y[idy+10] += x[10];
1271:       y[idy+11] += x[11];
1272:       x    += 12;
1273:     }
1274:     break;
1275: #if !defined(PETSC_USE_COMPLEX)
1276:   case MAX_VALUES:
1277:     for (i=0; i<n; i++) {
1278:       idy       = *indicesy++;
1279:       y[idy]    = PetscMax(y[idy],x[0]);
1280:       y[idy+1]  = PetscMax(y[idy+1],x[1]);
1281:       y[idy+2]  = PetscMax(y[idy+2],x[2]);
1282:       y[idy+3]  = PetscMax(y[idy+3],x[3]);
1283:       y[idy+4]  = PetscMax(y[idy+4],x[4]);
1284:       y[idy+5]  = PetscMax(y[idy+5],x[5]);
1285:       y[idy+6]  = PetscMax(y[idy+6],x[6]);
1286:       y[idy+7]  = PetscMax(y[idy+7],x[7]);
1287:       y[idy+8]  = PetscMax(y[idy+8],x[8]);
1288:       y[idy+9]  = PetscMax(y[idy+9],x[9]);
1289:       y[idy+10] = PetscMax(y[idy+10],x[10]);
1290:       y[idy+11] = PetscMax(y[idy+11],x[11]);
1291:       x    += 12;
1292:     }
1293: #else
1294:   case MAX_VALUES:
1295: #endif
1296:   case NOT_SET_VALUES:
1297:     break;
1298:   }
1299: }

1301: PETSC_STATIC_INLINE void Scatter_12(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
1302: {
1303:   PetscInt i,idx,idy;

1305:   switch (addv) {
1306:   case INSERT_VALUES:
1307:     for (i=0; i<n; i++) {
1308:       idx       = *indicesx++;
1309:       idy       = *indicesy++;
1310:       y[idy]    = x[idx];
1311:       y[idy+1]  = x[idx+1];
1312:       y[idy+2]  = x[idx+2];
1313:       y[idy+3]  = x[idx+3];
1314:       y[idy+4]  = x[idx+4];
1315:       y[idy+5]  = x[idx+5];
1316:       y[idy+6]  = x[idx+6];
1317:       y[idy+7]  = x[idx+7];
1318:       y[idy+8]  = x[idx+8];
1319:       y[idy+9]  = x[idx+9];
1320:       y[idy+10] = x[idx+10];
1321:       y[idy+11] = x[idx+11];
1322:     }
1323:     break;
1324:   case ADD_VALUES:
1325:     for (i=0; i<n; i++) {
1326:       idx       = *indicesx++;
1327:       idy       = *indicesy++;
1328:       y[idy]    += x[idx];
1329:       y[idy+1]  += x[idx+1];
1330:       y[idy+2]  += x[idx+2];
1331:       y[idy+3]  += x[idx+3];
1332:       y[idy+4]  += x[idx+4];
1333:       y[idy+5]  += x[idx+5];
1334:       y[idy+6]  += x[idx+6];
1335:       y[idy+7]  += x[idx+7];
1336:       y[idy+8]  += x[idx+8];
1337:       y[idy+9]  += x[idx+9];
1338:       y[idy+10] += x[idx+10];
1339:       y[idy+11] += x[idx+11];
1340:     }
1341:     break;
1342: #if !defined(PETSC_USE_COMPLEX)
1343:   case MAX_VALUES:
1344:     for (i=0; i<n; i++) {
1345:       idx       = *indicesx++;
1346:       idy       = *indicesy++;
1347:       y[idy]    = PetscMax(y[idy],x[idx]);
1348:       y[idy+1]  = PetscMax(y[idy+1],x[idx+1]);
1349:       y[idy+2]  = PetscMax(y[idy+2],x[idx+2]);
1350:       y[idy+3]  = PetscMax(y[idy+3],x[idx+3]);
1351:       y[idy+4]  = PetscMax(y[idy+4],x[idx+4]);
1352:       y[idy+5]  = PetscMax(y[idy+5],x[idx+5]);
1353:       y[idy+6]  = PetscMax(y[idy+6],x[idx+6]);
1354:       y[idy+7]  = PetscMax(y[idy+7],x[idx+7]);
1355:       y[idy+8]  = PetscMax(y[idy+8],x[idx+8]);
1356:       y[idy+9]  = PetscMax(y[idy+9],x[idx+9]);
1357:       y[idy+10] = PetscMax(y[idy+10],x[idx+10]);
1358:       y[idy+11] = PetscMax(y[idy+11],x[idx+11]);
1359:     }
1360: #else
1361:   case MAX_VALUES:
1362: #endif
1363:   case NOT_SET_VALUES:
1364:     break;
1365:   }
1366: }

1368: /* Create the VecScatterBegin/End_P for our chosen block sizes */
1369: #define BS 1
1370:  #include "src/vec/vec/utils/vpscat.h"
1371: #define BS 2
1372:  #include "src/vec/vec/utils/vpscat.h"
1373: #define BS 3
1374:  #include "src/vec/vec/utils/vpscat.h"
1375: #define BS 4
1376:  #include "src/vec/vec/utils/vpscat.h"
1377: #define BS 5
1378:  #include "src/vec/vec/utils/vpscat.h"
1379: #define BS 6
1380:  #include "src/vec/vec/utils/vpscat.h"
1381: #define BS 7
1382:  #include "src/vec/vec/utils/vpscat.h"
1383: #define BS 8
1384:  #include "src/vec/vec/utils/vpscat.h"
1385: #define BS 12
1386:  #include "src/vec/vec/utils/vpscat.h"
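/*
    Sketch of the block-size templating used above (an assumption about the contents of
    src/vec/vec/utils/vpscat.h, which is not reproduced here): the header is expected to
    splice BS into the routine names via token pasting, roughly

       #define PETSCMAP1_a(a,b)  a ## _ ## b
       #define PETSCMAP1_b(a,b)  PETSCMAP1_a(a,b)
       #define PETSCMAP1(a)      PETSCMAP1_b(a,BS)

       PetscErrorCode PETSCMAP1(VecScatterBegin)(Vec xin,Vec yin,
                         InsertMode addv,ScatterMode mode,VecScatter ctx)
       {
          ... packs with PETSCMAP1(Pack), starts the persistent requests, ...
       }
       #undef BS

    so the nine inclusions generate VecScatterBegin_1 ... VecScatterBegin_12 (and the
    matching VecScatterEnd_<bs> routines) without hand-duplicating the source.
*/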

1388: /* ==========================================================================================*/

1390: /*              create parallel to sequential scatter context                           */

1392: PetscErrorCode VecScatterCreateCommon_PtoS(VecScatter_MPI_General *,VecScatter_MPI_General *,VecScatter);

1396: /*
1397:      sendSizes[] and recvSizes[] cannot contain any 0 entries. To support 0 entries, the code below
1398:    would need to "compress out" the corresponding sendProcs[] and recvProcs[] entries.

1400:     This probably does not handle sends to self properly; those entries should be removed from the
1401:     counts used when allocating space inside the from struct.
1402: */
1403: PetscErrorCode VecScatterCreateLocal_PtoS(PetscInt nsends,const PetscInt sendSizes[],const PetscInt sendProcs[],const PetscInt sendIdx[],PetscInt nrecvs,const PetscInt recvSizes[],const PetscInt recvProcs[],const PetscInt recvIdx[],PetscInt bs,VecScatter ctx)
1404: {
1405:   VecScatter_MPI_General *from, *to;
1406:   PetscInt       sendSize, recvSize;
1407:   PetscInt       n, i;

1410:   /* allocate entire send scatter context */
1411:   PetscNew(VecScatter_MPI_General,&to);
1412:   to->n = nsends;
1413:   for(n = 0, sendSize = 0; n < to->n; n++) {sendSize += sendSizes[n];}
1414:   PetscMalloc(to->n*sizeof(MPI_Request),&to->requests);
1415:   PetscMalloc4(bs*sendSize,PetscScalar,&to->values,
1416:                       sendSize,PetscInt,&to->indices,
1417:                       to->n+1,PetscInt,&to->starts,
1418:                       to->n,PetscMPIInt,&to->procs);
1419:   PetscMalloc2(PetscMax(to->n,nrecvs),MPI_Status,&to->sstatus,
1420:                       PetscMax(to->n,nrecvs),MPI_Status,&to->rstatus);
1421:   to->starts[0] = 0;
1422:   for(n = 0; n < to->n; n++) {
1423:     if (sendSizes[n] <=0 ) SETERRQ2(PETSC_ERR_ARG_OUTOFRANGE,"sendSizes[n=%D] = %D cannot be less than 1",n,sendSizes[n]);
1424:     to->starts[n+1] = to->starts[n] + sendSizes[n];
1425:     to->procs[n] = sendProcs[n];
1426:     for(i = to->starts[n]; i < to->starts[n]+sendSizes[n]; i++) {
1427:       to->indices[i] = sendIdx[i];
1428:     }
1429:   }
1430:   ctx->todata = (void *) to;

1432:   /* allocate entire receive scatter context */
1433:   PetscNew(VecScatter_MPI_General,&from);
1434:   from->n = nrecvs;
1435:   for(n = 0, recvSize = 0; n < from->n; n++) {recvSize += recvSizes[n];}
1436:   PetscMalloc(from->n*sizeof(MPI_Request),&from->requests);
1437:   PetscMalloc4(bs*recvSize,PetscScalar,&from->values,
1438:                       recvSize,PetscInt,&from->indices,
1439:                       from->n+1,PetscInt,&from->starts,
1440:                       from->n,PetscMPIInt,&from->procs);
1441:   from->starts[0] = 0;
1442:   for(n = 0; n < from->n; n++) {
1443:     if (recvSizes[n] <=0 ) SETERRQ2(PETSC_ERR_ARG_OUTOFRANGE,"recvSizes[n=%D] = %D cannot be less than 1",n,recvSizes[n]);
1444:     from->starts[n+1] = from->starts[n] + recvSizes[n];
1445:     from->procs[n] = recvProcs[n];
1446:     for(i = from->starts[n]; i < from->starts[n]+recvSizes[n]; i++) {
1447:       from->indices[i] = recvIdx[i];
1448:     }
1449:   }
1450:   ctx->fromdata = (void *)from;

1452:   /* No local scatter optimization */
1453:   from->local.n      = 0;
1454:   from->local.vslots = 0;
1455:   to->local.n        = 0;
1456:   to->local.vslots   = 0;
1457:   from->local.nonmatching_computed = PETSC_FALSE;
1458:   from->local.n_nonmatching        = 0;
1459:   from->local.slots_nonmatching    = 0;
1460:   to->local.nonmatching_computed   = PETSC_FALSE;
1461:   to->local.n_nonmatching          = 0;
1462:   to->local.slots_nonmatching      = 0;

1464:   from->type = VEC_SCATTER_MPI_GENERAL;
1465:   to->type   = VEC_SCATTER_MPI_GENERAL;
1466:   from->bs = bs;
1467:   to->bs   = bs;

1469:   VecScatterCreateCommon_PtoS(from, to, ctx);
1470:   return(0);
1471: }
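/*
    Worked example (illustrative): with nsends = 2, sendSizes = {3,2} and sendProcs = {1,4},
    the loop above produces to->starts = {0,3,5}, to->procs = {1,4}, and copies
    sendIdx[0..4] into to->indices, i.e. the first three indices are destined for rank 1
    and the next two for rank 4.  The receive side is assembled the same way from
    recvSizes/recvProcs/recvIdx.
*/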

1473: /*
1474:    bs indicates how many elements there are in each block. Normally this would be 1.
1475: */
1478: PetscErrorCode VecScatterCreate_PtoS(PetscInt nx,const PetscInt *inidx,PetscInt ny,const PetscInt *inidy,Vec xin,Vec yin,PetscInt bs,VecScatter ctx)
1479: {
1480:   VecScatter_MPI_General *from,*to;
1481:   PetscMPIInt            size,rank,imdex,tag,n;
1482:   PetscInt               *source = PETSC_NULL,*lens = PETSC_NULL,*owners = PETSC_NULL;
1483:   PetscInt               *lowner = PETSC_NULL,*start = PETSC_NULL,lengthy,lengthx;
1484:   PetscInt               *nprocs = PETSC_NULL,i,j,idx,nsends,nrecvs;
1485:   PetscInt               *owner = PETSC_NULL,*starts = PETSC_NULL,count,slen;
1486:   PetscInt               *rvalues,*svalues,base,nmax,*values,*indx,nprocslocal;
1487:   MPI_Comm               comm;
1488:   MPI_Request            *send_waits = PETSC_NULL,*recv_waits = PETSC_NULL;
1489:   MPI_Status             recv_status,*send_status;
1490:   PetscErrorCode         ierr;

1493:   PetscObjectGetNewTag((PetscObject)ctx,&tag);
1494:   PetscObjectGetComm((PetscObject)xin,&comm);
1495:   MPI_Comm_rank(comm,&rank);
1496:   MPI_Comm_size(comm,&size);
1497:   owners = xin->map.range;
1498:   VecGetSize(yin,&lengthy);
1499:   VecGetSize(xin,&lengthx);

1501:   /*  first count number of contributors to each processor */
1502:   PetscMalloc2(2*size,PetscInt,&nprocs,nx,PetscInt,&owner);
1503:   PetscMemzero(nprocs,2*size*sizeof(PetscInt));
1504:   j      = 0;
1505:   nsends = 0;
1506:   for (i=0; i<nx; i++) {
1507:     idx = inidx[i];
1508:     if (idx < owners[j]) j = 0;
1509:     for (; j<size; j++) {
1510:       if (idx < owners[j+1]) {
1511:         if (!nprocs[2*j]++) nsends++;
1512:         nprocs[2*j+1] = 1;
1513:         owner[i] = j;
1514:         break;
1515:       }
1516:     }
1517:   }
1518:   nprocslocal    = nprocs[2*rank];
1519:   nprocs[2*rank] = nprocs[2*rank+1] = 0;
1520:   if (nprocslocal) nsends--;

1522:   /* inform other processors of number of messages and max length*/
1523:   PetscMaxSum(comm,nprocs,&nmax,&nrecvs);

1525:   /* post receives:   */
1526:   PetscMalloc4(nrecvs*nmax,PetscInt,&rvalues,nrecvs,PetscInt,&lens,nrecvs,PetscInt,&source,nrecvs,MPI_Request,&recv_waits);
1527:   for (i=0; i<nrecvs; i++) {
1528:     MPI_Irecv((rvalues+nmax*i),nmax,MPIU_INT,MPI_ANY_SOURCE,tag,comm,recv_waits+i);
1529:   }

1531:   /* do sends:
1532:      1) starts[i] gives the starting index in svalues for the entries going to
1533:      the ith processor
1534:   */
1535:   PetscMalloc3(nx,PetscInt,&svalues,nsends,MPI_Request,&send_waits,size+1,PetscInt,&starts);
1536:   starts[0]  = 0;
1537:   for (i=1; i<size; i++) { starts[i] = starts[i-1] + nprocs[2*i-2];}
1538:   for (i=0; i<nx; i++) {
1539:     if (owner[i] != rank) {
1540:       svalues[starts[owner[i]]++] = inidx[i];
1541:     }
1542:   }

1544:   starts[0] = 0;
1545:   for (i=1; i<size+1; i++) { starts[i] = starts[i-1] + nprocs[2*i-2];}
1546:   count = 0;
1547:   for (i=0; i<size; i++) {
1548:     if (nprocs[2*i+1]) {
1549:       MPI_Isend(svalues+starts[i],nprocs[2*i],MPIU_INT,i,tag,comm,send_waits+count++);
1550:     }
1551:   }

1553:   /*  wait on receives */
1554:   count  = nrecvs;
1555:   slen   = 0;
1556:   while (count) {
1557:     MPI_Waitany(nrecvs,recv_waits,&imdex,&recv_status);
1558:     /* unpack receives into our local space */
1559:     MPI_Get_count(&recv_status,MPIU_INT,&n);
1560:     source[imdex]  = recv_status.MPI_SOURCE;
1561:     lens[imdex]    = n;
1562:     slen          += n;
1563:     count--;
1564:   }
1565: 
1566:   /* allocate entire send scatter context */
1567:   PetscNew(VecScatter_MPI_General,&to);
1568:   to->n = nrecvs;
1569:   PetscMalloc(nrecvs*sizeof(MPI_Request),&to->requests);
1570:   PetscMalloc4(bs*slen,PetscScalar,&to->values,slen,PetscInt,&to->indices,nrecvs+1,PetscInt,&to->starts,
1571:                        nrecvs,PetscMPIInt,&to->procs);
1572:   PetscMalloc2(PetscMax(to->n,nsends),MPI_Status,&to->sstatus,PetscMax(to->n,nsends),MPI_Status,
1573:                        &to->rstatus);
1574:   ctx->todata   = (void*)to;
1575:   to->starts[0] = 0;

1577:   if (nrecvs) {
1578:     PetscMalloc(nrecvs*sizeof(PetscInt),&indx);
1579:     for (i=0; i<nrecvs; i++) indx[i] = i;
1580:     PetscSortIntWithPermutation(nrecvs,source,indx);

1582:     /* move the data into the send scatter */
1583:     base = owners[rank];
1584:     for (i=0; i<nrecvs; i++) {
1585:       to->starts[i+1] = to->starts[i] + lens[indx[i]];
1586:       to->procs[i]    = source[indx[i]];
1587:       values = rvalues + indx[i]*nmax;
1588:       for (j=0; j<lens[indx[i]]; j++) {
1589:         to->indices[to->starts[i] + j] = values[j] - base;
1590:       }
1591:     }
1592:     PetscFree(indx);
1593:   }
1594:   PetscFree4(rvalues,lens,source,recv_waits);
1595: 
1596:   /* allocate entire receive scatter context */
1597:   PetscNew(VecScatter_MPI_General,&from);
1598:   from->n        = nsends;

1600:   PetscMalloc(nsends*sizeof(MPI_Request),&from->requests);
1601:   PetscMalloc4((ny-nprocslocal)*bs,PetscScalar,&from->values,ny-nprocslocal,PetscInt,&from->indices,
1602:                       nsends+1,PetscInt,&from->starts,from->n,PetscMPIInt,&from->procs);
1603:   ctx->fromdata  = (void*)from;

1605:   /* move data into receive scatter */
1606:   PetscMalloc2(size,PetscInt,&lowner,nsends+1,PetscInt,&start);
1607:   count = 0; from->starts[0] = start[0] = 0;
1608:   for (i=0; i<size; i++) {
1609:     if (nprocs[2*i+1]) {
1610:       lowner[i]            = count;
1611:       from->procs[count++] = i;
1612:       from->starts[count]  = start[count] = start[count-1] + nprocs[2*i];
1613:     }
1614:   }
1615:   for (i=0; i<nx; i++) {
1616:     if (owner[i] != rank) {
1617:       from->indices[start[lowner[owner[i]]]++] = inidy[i];
1618:       if (inidy[i] >= lengthy) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Scattering past end of TO vector");
1619:     }
1620:   }
1621:   PetscFree2(lowner,start);
1622:   PetscFree2(nprocs,owner);
1623: 
1624:   /* wait on sends */
1625:   if (nsends) {
1626:     PetscMalloc(nsends*sizeof(MPI_Status),&send_status);
1627:     MPI_Waitall(nsends,send_waits,send_status);
1628:     PetscFree(send_status);
1629:   }
1630:   PetscFree3(svalues,send_waits,starts);

1632:   if (nprocslocal) {
1633:     PetscInt nt = from->local.n = to->local.n = nprocslocal;
1634:     /* we have a scatter to ourselves */
1635:     PetscMalloc(nt*sizeof(PetscInt),&to->local.vslots);
1636:     PetscMalloc(nt*sizeof(PetscInt),&from->local.vslots);
1637:     nt   = 0;
1638:     for (i=0; i<nx; i++) {
1639:       idx = inidx[i];
1640:       if (idx >= owners[rank] && idx < owners[rank+1]) {
1641:         to->local.vslots[nt]     = idx - owners[rank];
1642:         from->local.vslots[nt++] = inidy[i];
1643:         if (inidy[i] >= lengthy) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Scattering past end of TO vector");
1644:       }
1645:     }
1646:   } else {
1647:     from->local.n      = 0;
1648:     from->local.vslots = 0;
1649:     to->local.n        = 0;
1650:     to->local.vslots   = 0;
1651:   }
1652:   from->local.nonmatching_computed = PETSC_FALSE;
1653:   from->local.n_nonmatching        = 0;
1654:   from->local.slots_nonmatching    = 0;
1655:   to->local.nonmatching_computed   = PETSC_FALSE;
1656:   to->local.n_nonmatching          = 0;
1657:   to->local.slots_nonmatching      = 0;

1659:   from->type = VEC_SCATTER_MPI_GENERAL;
1660:   to->type   = VEC_SCATTER_MPI_GENERAL;
1661:   from->bs = bs;
1662:   to->bs   = bs;

1664:   VecScatterCreateCommon_PtoS(from,to,ctx);
1665:   return(0);
1666: }
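/*
   Editorial note: the send ("to") side assembled above uses a CSR-like layout.
   With purely illustrative numbers, two destination processes receiving 3 and 2
   blocks respectively would give

       to->n       = 2
       to->procs   = {4, 7}             ranks the messages go to
       to->starts  = {0, 3, 5}          message i covers indices[starts[i]..starts[i+1])
       to->indices = {0, 8, 2, 5, 1}    local offsets of the blocks to gather
       to->values    bs*5 scalars of packing space

   so message i carries bs*(to->starts[i+1]-to->starts[i]) scalars.  The receive
   ("from") side built below is laid out the same way, with indices into the
   destination vector instead.
*/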

1668: /*
1669:    bs indicates how many elements there are in each block. Normally this would be 1.
1670: */
1673: PetscErrorCode VecScatterCreateCommon_PtoS(VecScatter_MPI_General *from,VecScatter_MPI_General *to,VecScatter ctx)
1674: {
1675:   MPI_Comm       comm = ctx->comm;
1676:   PetscMPIInt    tag  = ctx->tag, tagr;
1677:   PetscInt       bs   = to->bs;
1678:   PetscMPIInt    size;
1679:   PetscInt       i, n;
1681: 
1683:   PetscObjectGetNewTag((PetscObject)ctx,&tagr);
1684:   ctx->destroy = VecScatterDestroy_PtoP;

1686:   PetscOptionsHasName(PETSC_NULL,"-vecscatter_sendfirst",&to->sendfirst);
1687:   from->sendfirst = to->sendfirst;

1689:   MPI_Comm_size(comm,&size);
1690:   /* check if the receives are ALL going into contiguous locations; if so can skip indexing */
1691:   to->contiq = PETSC_FALSE;
1692:   n = from->starts[from->n];
1693:   from->contiq = PETSC_TRUE;
1694:   for (i=1; i<n; i++) {
1695:     if (from->indices[i] != from->indices[i-1] + bs) {
1696:       from->contiq = PETSC_FALSE;
1697:       break;
1698:     }
1699:   }

1701:   PetscOptionsHasName(PETSC_NULL,"-vecscatter_alltoall",&to->use_alltoallv);
1702:   from->use_alltoallv = to->use_alltoallv;
1703:   if (from->use_alltoallv) PetscInfo(0,"Using MPI_Alltoallv() for scatter\n");
1704: #if defined(PETSC_HAVE_MPI_ALLTOALLW) 
1705:   if (to->use_alltoallv) {
1706:     PetscOptionsHasName(PETSC_NULL,"-vecscatter_nopack",&to->use_alltoallw);
1707:   }
1708:   from->use_alltoallw = to->use_alltoallw;
1709:   if (from->use_alltoallw) PetscInfo(0,"Using MPI_Alltoallw() for scatter\n");
1710: #endif

1712: #if defined(PETSC_HAVE_MPI_WIN_CREATE)
1713:   PetscOptionsHasName(PETSC_NULL,"-vecscatter_window",&to->use_window);
1714:   from->use_window = to->use_window;
1715: #endif
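  /*
     Editorial note: MPI_Alltoallv() takes a count and a displacement per rank
     (not per message), in units of the send/receive datatype.  The loops below
     therefore expand the per-message starts[] offsets into length-"size" arrays:
     counts[p] = bs*(starts[i+1]-starts[i]) for the message i that involves rank p
     (and 0 for ranks not communicated with), with displs[] the prefix sum of counts[].
  */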

1717:   if (to->use_alltoallv) {

1719:     PetscMalloc2(size,PetscMPIInt,&to->counts,size,PetscMPIInt,&to->displs);
1720:     PetscMemzero(to->counts,size*sizeof(PetscMPIInt));
1721:     for (i=0; i<to->n; i++) {
1722:       to->counts[to->procs[i]] = bs*(to->starts[i+1] - to->starts[i]);
1723:     }
1724:     to->displs[0] = 0;
1725:     for (i=1; i<size; i++) {
1726:       to->displs[i] = to->displs[i-1] + to->counts[i-1];
1727:     }

1729:     PetscMalloc2(size,PetscMPIInt,&from->counts,size,PetscMPIInt,&from->displs);
1730:     PetscMemzero(from->counts,size*sizeof(PetscMPIInt));
1731:     for (i=0; i<from->n; i++) {
1732:       from->counts[from->procs[i]] = bs*(from->starts[i+1] - from->starts[i]);
1733:     }
1734:     from->displs[0] = 0;
1735:     for (i=1; i<size; i++) {
1736:       from->displs[i] = from->displs[i-1] + from->counts[i-1];
1737:     }
1738: #if defined(PETSC_HAVE_MPI_ALLTOALLW) 
1739:     if (to->use_alltoallw) {
1740:       ctx->packtogether = PETSC_FALSE;
1741:       PetscMalloc3(size,PetscMPIInt,&to->wcounts,size,PetscMPIInt,&to->wdispls,size,MPI_Datatype,&to->types);
1742:       PetscMemzero(to->wcounts,size*sizeof(PetscMPIInt));
1743:       PetscMemzero(to->wdispls,size*sizeof(PetscMPIInt));
1744:       for (i=0; i<size; i++) {
1745:         to->types[i] = MPIU_SCALAR;
1746:       }
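      /*
         Editorial note: with -vecscatter_nopack each rank that receives data is
         sent a single "element" whose datatype is an indexed block: bs consecutive
         MPIU_SCALARs at every offset listed in to->indices for that message.  MPI
         then gathers the entries directly from the vector array during
         MPI_Alltoallw(), so no packing pass is needed and wcounts[] is simply 1
         for the ranks involved.
      */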

1748:       for (i=0; i<to->n; i++) {
1749:         to->wcounts[to->procs[i]] = 1;
1750:         MPI_Type_create_indexed_block(to->starts[i+1]-to->starts[i],bs,to->indices+to->starts[i],MPIU_SCALAR,to->types+to->procs[i]);
1751:         MPI_Type_commit(to->types+to->procs[i]);
1752:       }
1753:       PetscMalloc3(size,PetscMPIInt,&from->wcounts,size,PetscMPIInt,&from->wdispls,size,MPI_Datatype,&from->types);
1754:       PetscMemzero(from->wcounts,size*sizeof(PetscMPIInt));
1755:       PetscMemzero(from->wdispls,size*sizeof(PetscMPIInt));
1756:       for (i=0; i<size; i++) {
1757:         from->types[i] = MPIU_SCALAR;
1758:       }
1759:       if (from->contiq) {
1760:         PetscInfo(0,"Scattered vector entries are stored contiguously, taking advantage of this with -vecscatter_alltoall\n");
1761:         for (i=0; i<from->n; i++) {
1762:           from->wcounts[from->procs[i]] = bs*(from->starts[i+1] - from->starts[i]);
1763:         }
1764:         if (from->n) from->wdispls[from->procs[0]] = sizeof(PetscScalar)*from->indices[0];
1765:         for (i=1; i<from->n; i++) {
1766:           from->wdispls[from->procs[i]] = from->wdispls[from->procs[i-1]] + sizeof(PetscScalar)*from->wcounts[from->procs[i-1]];
1767:         }
1768:       } else {
1769:         for (i=0; i<from->n; i++) {
1770:           from->wcounts[from->procs[i]] = 1;
1771:           MPI_Type_create_indexed_block(from->starts[i+1]-from->starts[i],bs,from->indices+from->starts[i],MPIU_SCALAR,from->types+from->procs[i]);
1772:           MPI_Type_commit(from->types+from->procs[i]);
1773:         }
1774:       }
1775:     }
1776: #else 
1777:     to->use_alltoallw   = PETSC_FALSE;
1778:     from->use_alltoallw = PETSC_FALSE;
1779: #endif
1780: #if defined(PETSC_HAVE_MPI_WIN_CREATE)
1781:   } else if (to->use_window) {
1782:     PetscMPIInt temptag,winsize;
1783:     MPI_Request *request;
1784:     MPI_Status  *status;
1785: 
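    /*
       Editorial note: in the -vecscatter_window path each side exposes its buffer
       with MPI_Win_create(), and the temptag messages below exchange the offsets
       (winstarts[]) at which the other side's contribution lives in that buffer,
       so the scatter and its reverse can later use one-sided access into the
       remote window instead of matched sends and receives.
    */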
1786:     PetscObjectGetNewTag((PetscObject)ctx,&temptag);
1787:     winsize = (to->n ? to->starts[to->n] : 0)*sizeof(PetscScalar);
1788:     MPI_Win_create(to->values ? to->values : MPI_BOTTOM,winsize,sizeof(PetscScalar),MPI_INFO_NULL,comm,&to->window);
1789:     PetscMalloc(to->n*sizeof(PetscInt),&to->winstarts);   /* PetscMalloc() takes a byte count */
1790:     PetscMalloc2(to->n,MPI_Request,&request,to->n,MPI_Status,&status);
1791:     for (i=0; i<to->n; i++) {
1792:       MPI_Irecv(to->winstarts+i,1,MPIU_INT,to->procs[i],temptag,comm,request+i);
1793:     }
1794:     for (i=0; i<from->n; i++) {
1795:       MPI_Send(from->starts+i,1,MPIU_INT,from->procs[i],temptag,comm);
1796:     }
1797:     MPI_Waitall(to->n,request,status);
1798:     PetscFree2(request,status);

1800:     winsize = (from->n ? from->starts[from->n] : 0)*sizeof(PetscScalar);
1801:     MPI_Win_create(from->values ? from->values : MPI_BOTTOM,winsize,sizeof(PetscScalar),MPI_INFO_NULL,comm,&from->window);
1802:     PetscMalloc(from->n*sizeof(PetscInt),&from->winstarts); /* PetscMalloc() takes a byte count */
1803:     PetscMalloc2(from->n,MPI_Request,&request,from->n,MPI_Status,&status);
1804:     for (i=0; i<from->n; i++) {
1805:       MPI_Irecv(from->winstarts+i,1,MPIU_INT,from->procs[i],temptag,comm,request+i);
1806:     }
1807:     for (i=0; i<to->n; i++) {
1808:       MPI_Send(to->starts+i,1,MPIU_INT,to->procs[i],temptag,comm);
1809:     }
1810:     MPI_Waitall(from->n,request,status);
1811:     PetscFree2(request,status);
1812: #endif
1813:   } else {
1814:     PetscTruth  use_rsend = PETSC_FALSE, use_ssend = PETSC_FALSE;
1815:     PetscInt    *sstarts = to->starts,  *rstarts = from->starts;
1816:     PetscMPIInt *sprocs  = to->procs,   *rprocs  = from->procs;
1817:     MPI_Request *swaits  = to->requests,*rwaits  = from->requests;
1818:     MPI_Request *rev_swaits,*rev_rwaits;
1819:     PetscScalar *Ssvalues = to->values, *Srvalues = from->values;

1821:     /* allocate additional wait variables for the "reverse" scatter */
1822:     PetscMalloc(to->n*sizeof(MPI_Request),&rev_rwaits);
1823:     PetscMalloc(from->n*sizeof(MPI_Request),&rev_swaits);
1824:     to->rev_requests   = rev_rwaits;
1825:     from->rev_requests = rev_swaits;

1827:     /* Choose the send mode (ready/synchronous/standard) requested on the command line, then create the persistent requests for the forward scatter and its reverse */
1828:     PetscOptionsHasName(PETSC_NULL,"-vecscatter_rsend",&use_rsend);
1829:     PetscOptionsHasName(PETSC_NULL,"-vecscatter_ssend",&use_ssend);
1830:     if (use_rsend) {
1831:       PetscInfo(0,"Using VecScatter ready receiver mode\n");
1832:       to->use_readyreceiver    = PETSC_TRUE;
1833:       from->use_readyreceiver  = PETSC_TRUE;
1834:     } else {
1835:       to->use_readyreceiver    = PETSC_FALSE;
1836:       from->use_readyreceiver  = PETSC_FALSE;
1837:     }
1838:     if (use_ssend) {
1839:       PetscInfo(0,"Using VecScatter Ssend mode\n");
1840:     }
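    /*
       Editorial note: the loops below create MPI persistent requests once, at
       scatter-creation time.  MPI_Send_init()/MPI_Recv_init() (or the ready and
       synchronous variants selected by -vecscatter_rsend / -vecscatter_ssend)
       only register the message parameters; the communication is launched later
       with MPI_Start()/MPI_Startall() each time the scatter is applied.  Requests
       created with "tag" drive the forward scatter, those with "tagr" the reverse.
    */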

1842:     for (i=0; i<from->n; i++) {
1843:       if (use_rsend) {
1844:         MPI_Rsend_init(Srvalues+bs*rstarts[i],bs*rstarts[i+1]-bs*rstarts[i],MPIU_SCALAR,rprocs[i],tagr,comm,rev_swaits+i);
1845:       } else if (use_ssend) {
1846:         MPI_Ssend_init(Srvalues+bs*rstarts[i],bs*rstarts[i+1]-bs*rstarts[i],MPIU_SCALAR,rprocs[i],tagr,comm,rev_swaits+i);
1847:       } else {
1848:         MPI_Send_init(Srvalues+bs*rstarts[i],bs*rstarts[i+1]-bs*rstarts[i],MPIU_SCALAR,rprocs[i],tagr,comm,rev_swaits+i);
1849:       }
1850:     }

1852:     for (i=0; i<to->n; i++) {
1853:       if (use_rsend) {
1854:         MPI_Rsend_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,comm,swaits+i);
1855:       } else if (use_ssend) {
1856:         MPI_Ssend_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,comm,swaits+i);
1857:       } else {
1858:         MPI_Send_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,comm,swaits+i);
1859:       }
1860:     }
1861:     /* Register receives for scatter and reverse */
1862:     for (i=0; i<from->n; i++) {
1863:       MPI_Recv_init(Srvalues+bs*rstarts[i],bs*rstarts[i+1]-bs*rstarts[i],MPIU_SCALAR,rprocs[i],tag,comm,rwaits+i);
1864:     }
1865:     for (i=0; i<to->n; i++) {
1866:       MPI_Recv_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tagr,comm,rev_rwaits+i);
1867:     }
1868:     if (use_rsend) {
1869:       if (to->n)   {MPI_Startall_irecv(to->starts[to->n]*to->bs,to->n,to->rev_requests);}
1870:       if (from->n) {MPI_Startall_irecv(from->starts[from->n]*from->bs,from->n,from->requests);}
1871:       MPI_Barrier(comm);
1872:     }

1874:     ctx->copy      = VecScatterCopy_PtoP_X;
1875:   }
1876:   PetscInfo1(0,"Using blocksize %D scatter\n",bs);
1877:   switch (bs) {
1878:   case 12:
1879:     ctx->begin     = VecScatterBegin_12;
1880:     ctx->end       = VecScatterEnd_12;
1881:     break;
1882:   case 8:
1883:     ctx->begin     = VecScatterBegin_8;
1884:     ctx->end       = VecScatterEnd_8;
1885:     break;
1886:   case 7:
1887:     ctx->begin     = VecScatterBegin_7;
1888:     ctx->end       = VecScatterEnd_7;
1889:     break;
1890:   case 6:
1891:     ctx->begin     = VecScatterBegin_6;
1892:     ctx->end       = VecScatterEnd_6;
1893:     break;
1894:   case 5:
1895:     ctx->begin     = VecScatterBegin_5;
1896:     ctx->end       = VecScatterEnd_5;
1897:     break;
1898:   case 4:
1899:     ctx->begin     = VecScatterBegin_4;
1900:     ctx->end       = VecScatterEnd_4;
1901:     break;
1902:   case 3:
1903:     ctx->begin     = VecScatterBegin_3;
1904:     ctx->end       = VecScatterEnd_3;
1905:     break;
1906:   case 2:
1907:     ctx->begin     = VecScatterBegin_2;
1908:     ctx->end       = VecScatterEnd_2;
1909:     break;
1910:   case 1:
1911:     ctx->begin     = VecScatterBegin_1;
1912:     ctx->end       = VecScatterEnd_1;
1913:     break;
1914:   default:
1915:     SETERRQ(PETSC_ERR_SUP,"Blocksize not supported");
1916:   }
1917:   ctx->view      = VecScatterView_MPI;

1919:   /* Check if the local scatter is actually a copy; important special case */
1920:   if (to->local.n) {
1921:     VecScatterLocalOptimizeCopy_Private(&to->local,&from->local,bs);
1922:   }
1923:   return(0);
1924: }
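/*
   Editorial sketch (not part of the original vpscat.c): roughly how the
   persistent requests built in VecScatterCreateCommon_PtoS() are used for one
   forward scatter.  It ignores insert/add modes, -vecscatter_sendfirst,
   -vecscatter_rsend and the alltoall/window variants; the helper name and its
   arguments are illustrative only.
*/
#if 0
static PetscErrorCode Sketch_ScatterForward(VecScatter_MPI_General *to,VecScatter_MPI_General *from,
                                            const PetscScalar *xv,PetscScalar *yv)
{
  PetscInt i,j,k,bs = to->bs;

  /* start the preposted receives (created above with MPI_Recv_init()) */
  if (from->n) {MPI_Startall(from->n,from->requests);}

  /* pack the outgoing entries, bs scalars per index, then start the persistent sends */
  for (i=0; i<to->n; i++) {
    for (j=to->starts[i]; j<to->starts[i+1]; j++) {
      for (k=0; k<bs; k++) to->values[bs*j+k] = xv[to->indices[j]+k];
    }
  }
  if (to->n) {MPI_Startall(to->n,to->requests);}

  /* wait for the incoming messages and unpack them into the destination vector */
  if (from->n) {MPI_Waitall(from->n,from->requests,to->rstatus);}
  for (j=0; j<from->starts[from->n]; j++) {
    for (k=0; k<bs; k++) yv[from->indices[j]+k] = from->values[bs*j+k];
  }
  /* make sure our own sends have completed before the buffers are reused */
  if (to->n) {MPI_Waitall(to->n,to->requests,to->sstatus);}
  return(0);
}
#endif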



1928: /* ------------------------------------------------------------------------------------*/
1929: /*
1930:          Scatter from local Seq vectors to a parallel vector. 
1931:          Reverses the order of the arguments, calls VecScatterCreate_PtoS() then
1932:          reverses the result.
1933: */
1936: PetscErrorCode VecScatterCreate_StoP(PetscInt nx,const PetscInt *inidx,PetscInt ny,const PetscInt *inidy,Vec xin,Vec yin,PetscInt bs,VecScatter ctx)
1937: {
1938:   PetscErrorCode         ierr;
1939:   MPI_Request            *waits;
1940:   VecScatter_MPI_General *to,*from;

1943:   VecScatterCreate_PtoS(ny,inidy,nx,inidx,yin,xin,bs,ctx);
1944:   to            = (VecScatter_MPI_General*)ctx->fromdata;
1945:   from          = (VecScatter_MPI_General*)ctx->todata;
1946:   ctx->todata   = (void*)to;
1947:   ctx->fromdata = (void*)from;
1948:   /* these two are special; they are ALWAYS stored in the "to" struct */
1949:   to->sstatus   = from->sstatus;
1950:   to->rstatus   = from->rstatus;

1952:   from->sstatus = 0;
1953:   from->rstatus = 0;

1955:   waits              = from->rev_requests;
1956:   from->rev_requests = from->requests;
1957:   from->requests     = waits;
1958:   waits              = to->rev_requests;
1959:   to->rev_requests   = to->requests;
1960:   to->requests       = waits;
1961:   return(0);
1962: }

1964: /* ---------------------------------------------------------------------------------*/
1967: PetscErrorCode VecScatterCreate_PtoP(PetscInt nx,const PetscInt *inidx,PetscInt ny,const PetscInt *inidy,Vec xin,Vec yin,VecScatter ctx)
1968: {
1970:   PetscMPIInt    size,rank,tag,imdex,n;
1971:   PetscInt       *lens = PETSC_NULL,*owners = xin->map.range;
1972:   PetscInt       *nprocs = PETSC_NULL,i,j,idx,nsends,nrecvs,*local_inidx = PETSC_NULL,*local_inidy = PETSC_NULL;
1973:   PetscInt       *owner = PETSC_NULL,*starts = PETSC_NULL,count,slen;
1974:   PetscInt       *rvalues = PETSC_NULL,*svalues = PETSC_NULL,base,nmax,*values = PETSC_NULL,lastidx;
1975:   MPI_Comm       comm;
1976:   MPI_Request    *send_waits = PETSC_NULL,*recv_waits = PETSC_NULL;
1977:   MPI_Status     recv_status,*send_status = PETSC_NULL;
1978:   PetscTruth     duplicate = PETSC_FALSE;
1979: #if defined(PETSC_USE_DEBUG)
1980:   PetscTruth     found = PETSC_FALSE;
1981: #endif

1984:   PetscObjectGetNewTag((PetscObject)ctx,&tag);
1985:   PetscObjectGetComm((PetscObject)xin,&comm);
1986:   MPI_Comm_size(comm,&size);
1987:   MPI_Comm_rank(comm,&rank);
1988:   if (size == 1) {
1989:     VecScatterCreate_StoP(nx,inidx,ny,inidy,xin,yin,1,ctx);
1990:     return(0);
1991:   }

1993:   /*
1994:      Each processor ships its inidx[j] and inidy[j] to the processor that owns inidx[j];
1995:      the scatter is then built from the redistributed lists with VecScatterCreate_StoP()
1996:   */
1997:   /*  first count number of contributors to each processor */
1998:   PetscMalloc3(2*size,PetscInt,&nprocs,nx,PetscInt,&owner,(size+1),PetscInt,&starts);
1999:   PetscMemzero(nprocs,2*size*sizeof(PetscInt));
2000:   lastidx = -1;
2001:   j       = 0;
2002:   for (i=0; i<nx; i++) {
2003:     /* if indices are NOT locally sorted, need to start search at the beginning */
2004:     if (lastidx > (idx = inidx[i])) j = 0;
2005:     lastidx = idx;
2006:     for (; j<size; j++) {
2007:       if (idx >= owners[j] && idx < owners[j+1]) {
2008:         nprocs[2*j]++;
2009:         nprocs[2*j+1] = 1;
2010:         owner[i] = j;
2011: #if defined(PETSC_USE_DEBUG)
2012:         found = PETSC_TRUE;
2013: #endif
2014:         break;
2015:       }
2016:     }
2017: #if defined(PETSC_USE_DEBUG)
2018:     if (!found) SETERRQ1(PETSC_ERR_ARG_OUTOFRANGE,"Index %D out of range",idx);
2019:     found = PETSC_FALSE;
2020: #endif
2021:   }
2022:   nsends = 0;  for (i=0; i<size; i++) { nsends += nprocs[2*i+1];}

2024:   /* inform other processors of number of messages and max length*/
2025:   PetscMaxSum(comm,nprocs,&nmax,&nrecvs);
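  /*
     Editorial note: PetscMaxSum() returns, for this rank, the number of processes
     that will send to it (nrecvs) and the length of the longest such message
     (nmax, counted in index pairs).  Knowing both, the receives below can be
     posted from MPI_ANY_SOURCE into fixed-size slots of 2*nmax integers without
     first being told who the senders are.
  */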

2027:   /* post receives:   */
2028:   PetscMalloc6(2*nrecvs*nmax,PetscInt,&rvalues,2*nx,PetscInt,&svalues,2*nrecvs,PetscInt,&lens,nrecvs,MPI_Request,&recv_waits,nsends,MPI_Request,&send_waits,nsends,MPI_Status,&send_status);

2030:   for (i=0; i<nrecvs; i++) {
2031:     MPI_Irecv(rvalues+2*nmax*i,2*nmax,MPIU_INT,MPI_ANY_SOURCE,tag,comm,recv_waits+i);
2032:   }

2034:   /* do sends:
2035:       1) starts[i] gives the starting index in svalues for stuff going to 
2036:          the ith processor
2037:   */
2038:   starts[0]= 0;
2039:   for (i=1; i<size; i++) { starts[i] = starts[i-1] + nprocs[2*i-2];}
2040:   for (i=0; i<nx; i++) {
2041:     svalues[2*starts[owner[i]]]       = inidx[i];
2042:     svalues[1 + 2*starts[owner[i]]++] = inidy[i];
2043:   }

2045:   starts[0] = 0;
2046:   for (i=1; i<size+1; i++) { starts[i] = starts[i-1] + nprocs[2*i-2];}
2047:   count = 0;
2048:   for (i=0; i<size; i++) {
2049:     if (nprocs[2*i+1]) {
2050:       MPI_Isend(svalues+2*starts[i],2*nprocs[2*i],MPIU_INT,i,tag,comm,send_waits+count);
2051:       count++;
2052:     }
2053:   }
2054:   PetscFree3(nprocs,owner,starts);

2056:   /*  wait on receives */
2057:   count = nrecvs;
2058:   slen  = 0;
2059:   while (count) {
2060:     MPI_Waitany(nrecvs,recv_waits,&imdex,&recv_status);
2061:     /* unpack receives into our local space */
2062:     MPI_Get_count(&recv_status,MPIU_INT,&n);
2063:     lens[imdex]  =  n/2;
2064:     slen         += n/2;
2065:     count--;
2066:   }
2067: 
2068:   PetscMalloc2(slen,PetscInt,&local_inidx,slen,PetscInt,&local_inidy);
2069:   base  = owners[rank];
2070:   count = 0;
2071:   for (i=0; i<nrecvs; i++) {
2072:     values = rvalues + 2*i*nmax;
2073:     for (j=0; j<lens[i]; j++) {
2074:       local_inidx[count]   = values[2*j] - base;
2075:       local_inidy[count++] = values[2*j+1];
2076:     }
2077:   }

2079:   /* wait on sends */
2080:   if (nsends) {MPI_Waitall(nsends,send_waits,send_status);}
2081:   PetscFree6(rvalues,svalues,lens,recv_waits,send_waits,send_status);

2083:   /*
2084:      should sort and remove duplicates from local_inidx,local_inidy 
2085:   */

2087: #if defined(do_it_slow)
2088:   /* sort on the from index */
2089:   PetscSortIntWithArray(slen,local_inidx,local_inidy);
2090:   start = 0;
2091:   while (start < slen) {
2092:     count = start+1;
2093:     last  = local_inidx[start];
2094:     while (count < slen && last == local_inidx[count]) count++;
2095:     if (count > start + 1) { /* found 2 or more same local_inidx[] in a row */
2096:       /* sort on to index */
2097:       PetscSortInt(count-start,local_inidy+start);
2098:     }
2099:     /* remove duplicates; not most efficient way, but probably good enough */
2100:     i = start;
2101:     while (i < count-1) {
2102:       if (local_inidy[i] != local_inidy[i+1]) {
2103:         i++;
2104:       } else { /* found a duplicate */
2105:         duplicate = PETSC_TRUE;
2106:         for (j=i; j<slen-1; j++) {
2107:           local_inidx[j] = local_inidx[j+1];
2108:           local_inidy[j] = local_inidy[j+1];
2109:         }
2110:         slen--;
2111:         count--;
2112:       }
2113:     }
2114:     start = count;
2115:   }
2116: #endif
2117:   if (duplicate) {
2118:     PetscInfo(0,"Duplicate from/to index pairs passed to VecScatterCreate(); they are ignored\n");
2119:   }
2120:   VecScatterCreate_StoP(slen,local_inidx,slen,local_inidy,xin,yin,1,ctx);
2121:   PetscFree2(local_inidx,local_inidy);
2122:   return(0);
2123: }