Actual source code: vpscat.c
1: #define PETSCVEC_DLL
3: /*
4: Defines parallel vector scatters.
5: */
7: #include "include/private/isimpl.h"
8: #include "include/private/vecimpl.h"
9: #include "src/vec/vec/impls/dvecimpl.h"
10: #include "src/vec/vec/impls/mpi/pvecimpl.h"
11: #include "petscsys.h"
15: PetscErrorCode VecScatterView_MPI(VecScatter ctx,PetscViewer viewer)
16: {
17: VecScatter_MPI_General *to=(VecScatter_MPI_General*)ctx->todata;
18: VecScatter_MPI_General *from=(VecScatter_MPI_General*)ctx->fromdata;
19: PetscErrorCode ierr;
20: PetscInt i;
21: PetscMPIInt rank;
22: PetscViewerFormat format;
23: PetscTruth iascii;
26: PetscTypeCompare((PetscObject)viewer,PETSC_VIEWER_ASCII,&iascii);
27: if (iascii) {
28: MPI_Comm_rank(ctx->comm,&rank);
29: PetscViewerGetFormat(viewer,&format);
30: if (format == PETSC_VIEWER_ASCII_INFO) {
31: PetscInt nsend_max,nrecv_max,lensend_max,lenrecv_max,alldata,itmp;
33: MPI_Reduce(&to->n,&nsend_max,1,MPIU_INT,MPI_MAX,0,ctx->comm);
34: MPI_Reduce(&from->n,&nrecv_max,1,MPIU_INT,MPI_MAX,0,ctx->comm);
35: itmp = to->starts[to->n];
36: MPI_Reduce(&itmp,&lensend_max,1,MPIU_INT,MPI_MAX,0,ctx->comm);
37: itmp = from->starts[from->n];
38: MPI_Reduce(&itmp,&lenrecv_max,1,MPIU_INT,MPI_MAX,0,ctx->comm);
39: MPI_Reduce(&itmp,&alldata,1,MPIU_INT,MPI_SUM,0,ctx->comm);
41: PetscViewerASCIIPrintf(viewer,"VecScatter statistics\n");
42: PetscViewerASCIIPrintf(viewer," Maximum number sends %D\n",nsend_max);
43: PetscViewerASCIIPrintf(viewer," Maximum number receives %D\n",nrecv_max);
44: PetscViewerASCIIPrintf(viewer," Maximum data sent %D\n",(PetscInt)(lensend_max*to->bs*sizeof(PetscScalar)));
45: PetscViewerASCIIPrintf(viewer," Maximum data received %D\n",(PetscInt)(lenrecv_max*to->bs*sizeof(PetscScalar)));
46: PetscViewerASCIIPrintf(viewer," Total data sent %D\n",(PetscInt)(alldata*to->bs*sizeof(PetscScalar)));
48: } else {
49: PetscViewerASCIISynchronizedPrintf(viewer,"[%d] Number sends = %D; Number to self = %D\n",rank,to->n,to->local.n);
50: if (to->n) {
51: for (i=0; i<to->n; i++){
52: PetscViewerASCIISynchronizedPrintf(viewer,"[%d] %D length = %D to whom %D\n",rank,i,to->starts[i+1]-to->starts[i],to->procs[i]);
53: }
54: PetscViewerASCIISynchronizedPrintf(viewer,"Now the indices for all remote sends (in order by process sent to)\n");
55: for (i=0; i<to->starts[to->n]; i++){
56: PetscViewerASCIISynchronizedPrintf(viewer,"[%d]%D \n",rank,to->indices[i]);
57: }
58: }
60: PetscViewerASCIISynchronizedPrintf(viewer,"[%d]Number receives = %D; Number from self = %D\n",rank,from->n,from->local.n);
61: if (from->n) {
62: for (i=0; i<from->n; i++){
63: PetscViewerASCIISynchronizedPrintf(viewer,"[%d] %D length %D from whom %D\n",rank,i,from->starts[i+1]-from->starts[i],from->procs[i]);
64: }
66: PetscViewerASCIISynchronizedPrintf(viewer,"Now the indices for all remote receives (in order by process received from)\n");
67: for (i=0; i<from->starts[from->n]; i++){
68: PetscViewerASCIISynchronizedPrintf(viewer,"[%d]%D \n",rank,from->indices[i]);
69: }
70: }
71: if (to->local.n) {
72: PetscViewerASCIISynchronizedPrintf(viewer,"[%d]Indices for local part of scatter\n",rank);
73: for (i=0; i<to->local.n; i++){
74: PetscViewerASCIISynchronizedPrintf(viewer,"[%d]From %D to %D \n",rank,from->local.vslots[i],to->local.vslots[i]);
75: }
76: }
78: PetscViewerFlush(viewer);
79: }
80: } else {
81: SETERRQ1(PETSC_ERR_SUP,"Viewer type %s not supported for this scatter",((PetscObject)viewer)->type_name);
82: }
83: return(0);
84: }
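/* Illustrative sketch (editor addition, not part of vpscat.c): the to/from halves of a
   VecScatter_MPI_General store their messages in a CSR-like layout -- procs[i] is the
   peer rank of message i, starts[i]..starts[i+1] delimit its slice of indices[], and
   starts[n] is the total number of entries.  The struct and field names below are
   simplified stand-ins, used only to show the layout that VecScatterView_MPI prints. */
#include <stdio.h>

typedef struct {
  int  n;        /* number of messages */
  int *starts;   /* length n+1, starts[0] = 0 */
  int *procs;    /* length n, peer rank of each message */
  int *indices;  /* length starts[n], vector slots packed per message */
} MsgLayout;

static void MsgLayoutView(const MsgLayout *m)
{
  int i,j;
  for (i=0; i<m->n; i++) {
    printf("message %d: length = %d, peer = %d\n",i,m->starts[i+1]-m->starts[i],m->procs[i]);
    for (j=m->starts[i]; j<m->starts[i+1]; j++) printf("  index %d\n",m->indices[j]);
  }
}

int main(void)
{
  int starts[]  = {0,2,5};        /* two messages: lengths 2 and 3 */
  int procs[]   = {1,3};
  int indices[] = {4,7, 0,1,2};
  MsgLayout m = {2,starts,procs,indices};
  MsgLayoutView(&m);
  return 0;
}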
86: /* -----------------------------------------------------------------------------------*/
87: /*
88: The next routine determines which part of the local scatter is an
89: exact copy of values into their current location. We check this here and
90: then know that we need not perform that portion of the scatter.
91: */
94: PetscErrorCode VecScatterLocalOptimize_Private(VecScatter_Seq_General *to,VecScatter_Seq_General *from)
95: {
96: PetscInt n = to->n,n_nonmatching = 0,i,*to_slots = to->vslots,*from_slots = from->vslots;
98: PetscInt *nto_slots,*nfrom_slots,j = 0;
99:
101: for (i=0; i<n; i++) {
102: if (to_slots[i] != from_slots[i]) n_nonmatching++;
103: }
105: if (!n_nonmatching) {
106: to->nonmatching_computed = PETSC_TRUE;
107: to->n_nonmatching = from->n_nonmatching = 0;
108: PetscInfo1(0,"Reduced %D to 0\n", n);
109: } else if (n_nonmatching == n) {
110: to->nonmatching_computed = PETSC_FALSE;
111: PetscInfo(0,"All values non-matching\n");
112: } else {
113: to->nonmatching_computed= PETSC_TRUE;
114: to->n_nonmatching = from->n_nonmatching = n_nonmatching;
115: PetscMalloc(n_nonmatching*sizeof(PetscInt),&nto_slots);
116: PetscMalloc(n_nonmatching*sizeof(PetscInt),&nfrom_slots);
117: to->slots_nonmatching = nto_slots;
118: from->slots_nonmatching = nfrom_slots;
119: for (i=0; i<n; i++) {
120: if (to_slots[i] != from_slots[i]) {
121: nto_slots[j] = to_slots[i];
122: nfrom_slots[j] = from_slots[i];
123: j++;
124: }
125: }
126: PetscInfo2(0,"Reduced %D to %D\n",n,n_nonmatching);
127: }
128: return(0);
129: }
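/* Illustrative sketch (editor addition): the same filtering step in isolation.  It counts
   and compresses out the entries whose source and destination slots coincide, which is
   what lets the scatter skip that portion when the copy would be redundant, mirroring
   VecScatterLocalOptimize_Private() above.  Names here are this sketch's own. */
#include <stdio.h>

static int CompressMatching(int n,const int *to,const int *from,int *nto,int *nfrom)
{
  int i,m = 0;
  for (i=0; i<n; i++) {
    if (to[i] != from[i]) { nto[m] = to[i]; nfrom[m] = from[i]; m++; }
  }
  return m;                    /* number of entries that still need to be moved */
}

int main(void)
{
  int to[]   = {0,1,5,3};
  int from[] = {0,1,2,3};      /* slots 0, 1 and 3 already match */
  int nto[4],nfrom[4];
  int m = CompressMatching(4,to,from,nto,nfrom);
  printf("reduced 4 to %d: move from %d to %d\n",m,nfrom[0],nto[0]);
  return 0;
}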
131: /* --------------------------------------------------------------------------------------*/
133: /* -------------------------------------------------------------------------------------*/
136: PetscErrorCode VecScatterDestroy_PtoP(VecScatter ctx)
137: {
138: VecScatter_MPI_General *to = (VecScatter_MPI_General*)ctx->todata;
139: VecScatter_MPI_General *from = (VecScatter_MPI_General*)ctx->fromdata;
140: PetscErrorCode ierr;
141: PetscInt i;
144: if (to->use_readyreceiver) {
145: /*
146: Since we have already posted sends we must cancel them before freeing
147: the requests
148: */
149: for (i=0; i<from->n; i++) {
150: MPI_Cancel(from->requests+i);
151: }
152: for (i=0; i<to->n; i++) {
153: MPI_Cancel(to->rev_requests+i);
154: }
155: }
157: #if defined(PETSC_HAVE_MPI_ALLTOALLW)
158: if (to->use_alltoallw) {
159: PetscFree3(to->wcounts,to->wdispls,to->types);
160: PetscFree3(from->wcounts,from->wdispls,from->types);
161: }
162: #endif
164: #if defined(PETSC_HAVE_MPI_WINDOW)
165: if (to->use_window) {
166: MPI_Win_free(&from->window);
167: MPI_Win_free(&to->window);
168: }
169: #endif
171: if (to->use_alltoallv) {
172: PetscFree2(to->counts,to->displs);
173: PetscFree2(from->counts,from->displs);
174: }
176: /* release MPI resources obtained with MPI_Send_init() and MPI_Recv_init() */
177: /*
178: IBM's PE version of MPI has a bug where freeing these requests will corrupt
179: later message passing.
180: */
181: #if !defined(PETSC_HAVE_BROKEN_REQUEST_FREE)
182: if (!to->use_alltoallv) { /* currently the to->requests etc are ALWAYS allocated even if not used */
183: if (to->requests) {
184: for (i=0; i<to->n; i++) {
185: MPI_Request_free(to->requests + i);
186: }
187: }
188: if (to->rev_requests) {
189: for (i=0; i<to->n; i++) {
190: MPI_Request_free(to->rev_requests + i);
191: }
192: }
193: }
194: /*
195: MPICH could not properly cancel requests, so with ready-receiver mode we
196: cannot free the requests. This may be fixed now; if not, put the following
197: code inside an if (!to->use_readyreceiver) { ... } block.
198: */
199: if (!to->use_alltoallv) { /* currently the from->requests etc are ALWAYS allocated even if not used */
200: if (from->requests) {
201: for (i=0; i<from->n; i++) {
202: MPI_Request_free(from->requests + i);
203: }
204: }
206: if (from->rev_requests) {
207: for (i=0; i<from->n; i++) {
208: MPI_Request_free(from->rev_requests + i);
209: }
210: }
211: }
212: #endif
214: PetscFree(to->local.vslots);
215: PetscFree(from->local.vslots);
216: PetscFree2(to->counts,to->displs);
217: PetscFree2(from->counts,from->displs);
218: PetscFree(to->local.slots_nonmatching);
219: PetscFree(from->local.slots_nonmatching);
220: PetscFree(to->rev_requests);
221: PetscFree(from->rev_requests);
222: PetscFree(to->requests);
223: PetscFree(from->requests);
224: PetscFree4(to->values,to->indices,to->starts,to->procs);
225: PetscFree2(to->sstatus,to->rstatus);
226: PetscFree4(from->values,from->indices,from->starts,from->procs);
227: PetscFree(from);
228: PetscFree(to);
229: PetscHeaderDestroy(ctx);
230: return(0);
231: }
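/* Illustrative sketch (editor addition): the cancel-then-free pattern used above for
   persistent requests.  A persistent receive that has been started (as in ready-receiver
   mode) must be cancelled and completed before MPI_Request_free() is called on it.
   Standalone toy program, not PETSc code; the buffer and tag values are arbitrary. */
#include <mpi.h>
#include <stdio.h>

int main(int argc,char **argv)
{
  MPI_Request req;
  MPI_Status  status;
  int         buf,flag;

  MPI_Init(&argc,&argv);
  MPI_Recv_init(&buf,1,MPI_INT,MPI_ANY_SOURCE,99,MPI_COMM_WORLD,&req);
  MPI_Start(&req);                    /* receive is now pending, like a pre-posted ready receive */
  MPI_Cancel(&req);                   /* nothing will match it, so cancel ... */
  MPI_Wait(&req,&status);             /* ... and complete it */
  MPI_Test_cancelled(&status,&flag);
  if (flag) printf("receive cancelled cleanly\n");
  MPI_Request_free(&req);             /* now the persistent request may be freed */
  MPI_Finalize();
  return 0;
}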
235: /* --------------------------------------------------------------------------------------*/
236: /*
237: Special optimization to see if the local part of the scatter is actually
238: a copy; if so, the scatter routines call PetscMemcpy() instead of indexing.
239:
240: */
243: PetscErrorCode VecScatterLocalOptimizeCopy_Private(VecScatter_Seq_General *to,VecScatter_Seq_General *from,PetscInt bs)
244: {
245: PetscInt n = to->n,i,*to_slots = to->vslots,*from_slots = from->vslots;
246: PetscInt to_start,from_start;
250: to_start = to_slots[0];
251: from_start = from_slots[0];
253: for (i=1; i<n; i++) {
254: to_start += bs;
255: from_start += bs;
256: if (to_slots[i] != to_start) return(0);
257: if (from_slots[i] != from_start) return(0);
258: }
259: to->is_copy = PETSC_TRUE;
260: to->copy_start = to_slots[0];
261: to->copy_length = bs*sizeof(PetscScalar)*n;
262: from->is_copy = PETSC_TRUE;
263: from->copy_start = from_slots[0];
264: from->copy_length = bs*sizeof(PetscScalar)*n;
265: PetscInfo(0,"Local scatter is a copy, optimizing for it\n");
266: return(0);
267: }
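/* Illustrative sketch (editor addition): the same contiguity test in isolation.  If both
   index lists advance by exactly bs each step, the whole local scatter collapses to a
   single memcpy() of n*bs values.  Plain C, with double standing in for PetscScalar. */
#include <string.h>
#include <stdio.h>

static int IsBlockCopy(int n,int bs,const int *to,const int *from)
{
  int i;
  for (i=1; i<n; i++) {
    if (to[i]   != to[0]   + i*bs) return 0;
    if (from[i] != from[0] + i*bs) return 0;
  }
  return 1;
}

int main(void)
{
  double x[12] = {0,1,2,3,4,5,6,7,8,9,10,11},y[12] = {0};
  int    from[] = {2,4,6},to[] = {0,2,4};  /* three blocks of size 2, both strided by bs */
  int    n = 3,bs = 2;

  if (IsBlockCopy(n,bs,to,from)) {
    memcpy(y + to[0],x + from[0],(size_t)n*bs*sizeof(double));  /* one copy instead of a loop */
  }
  printf("y[0..5] = %g %g %g %g %g %g\n",y[0],y[1],y[2],y[3],y[4],y[5]);
  return 0;
}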
269: /* --------------------------------------------------------------------------------------*/
273: PetscErrorCode VecScatterCopy_PtoP_X(VecScatter in,VecScatter out)
274: {
275: VecScatter_MPI_General *in_to = (VecScatter_MPI_General*)in->todata;
276: VecScatter_MPI_General *in_from = (VecScatter_MPI_General*)in->fromdata,*out_to,*out_from;
277: PetscErrorCode ierr;
278: PetscInt ny,bs = in_from->bs;
281: out->begin = in->begin;
282: out->end = in->end;
283: out->copy = in->copy;
284: out->destroy = in->destroy;
285: out->view = in->view;
287: /* allocate entire send scatter context */
288: PetscNew(VecScatter_MPI_General,&out_to);
289: PetscNew(VecScatter_MPI_General,&out_from);
291: ny = in_to->starts[in_to->n];
292: out_to->n = in_to->n;
293: out_to->type = in_to->type;
294: out_to->sendfirst = in_to->sendfirst;
296: PetscMalloc(out_to->n*sizeof(MPI_Request),&out_to->requests);
297: PetscMalloc4(bs*ny,PetscScalar,&out_to->values,ny,PetscInt,&out_to->indices,out_to->n+1,PetscInt,&out_to->starts,
298: out_to->n,PetscMPIInt,&out_to->procs);
299: PetscMalloc2(PetscMax(in_to->n,in_from->n),MPI_Status,&out_to->sstatus,PetscMax(in_to->n,in_from->n),MPI_Status,
300: &out_to->rstatus);
301: PetscMemcpy(out_to->indices,in_to->indices,ny*sizeof(PetscInt));
302: PetscMemcpy(out_to->starts,in_to->starts,(out_to->n+1)*sizeof(PetscInt));
303: PetscMemcpy(out_to->procs,in_to->procs,(out_to->n)*sizeof(PetscMPIInt));
304:
305: out->todata = (void*)out_to;
306: out_to->local.n = in_to->local.n;
307: out_to->local.nonmatching_computed = PETSC_FALSE;
308: out_to->local.n_nonmatching = 0;
309: out_to->local.slots_nonmatching = 0;
310: if (in_to->local.n) {
311: PetscMalloc(in_to->local.n*sizeof(PetscInt),&out_to->local.vslots);
312: PetscMalloc(in_from->local.n*sizeof(PetscInt),&out_from->local.vslots);
313: PetscMemcpy(out_to->local.vslots,in_to->local.vslots,in_to->local.n*sizeof(PetscInt));
314: PetscMemcpy(out_from->local.vslots,in_from->local.vslots,in_from->local.n*sizeof(PetscInt));
315: } else {
316: out_to->local.vslots = 0;
317: out_from->local.vslots = 0;
318: }
320: /* allocate entire receive context */
321: out_from->type = in_from->type;
322: ny = in_from->starts[in_from->n];
323: out_from->n = in_from->n;
324: out_from->sendfirst = in_from->sendfirst;
326: PetscMalloc(out_from->n*sizeof(MPI_Request),&out_from->requests);
327: PetscMalloc4(ny*bs,PetscScalar,&out_from->values,ny,PetscInt,&out_from->indices,
328: out_from->n+1,PetscInt,&out_from->starts,out_from->n,PetscMPIInt,&out_from->procs);
329: PetscMemcpy(out_from->indices,in_from->indices,ny*sizeof(PetscInt));
330: PetscMemcpy(out_from->starts,in_from->starts,(out_from->n+1)*sizeof(PetscInt));
331: PetscMemcpy(out_from->procs,in_from->procs,(out_from->n)*sizeof(PetscMPIInt));
332: out->fromdata = (void*)out_from;
333: out_from->local.n = in_from->local.n;
334: out_from->local.nonmatching_computed = PETSC_FALSE;
335: out_from->local.n_nonmatching = 0;
336: out_from->local.slots_nonmatching = 0;
338: /*
339: set up the request arrays for use with MPI_Send_init() and MPI_Recv_init()
340: */
341: {
342: PetscMPIInt tag;
343: MPI_Comm comm;
344: PetscInt *sstarts = out_to->starts, *rstarts = out_from->starts;
345: PetscMPIInt *sprocs = out_to->procs, *rprocs = out_from->procs;
346: PetscInt i;
347: PetscTruth flg;
348: MPI_Request *swaits = out_to->requests,*rwaits = out_from->requests;
349: MPI_Request *rev_swaits,*rev_rwaits;
350: PetscScalar *Ssvalues = out_to->values, *Srvalues = out_from->values;
352: PetscMalloc(in_to->n*sizeof(MPI_Request),&out_to->rev_requests);
353: PetscMalloc(in_from->n*sizeof(MPI_Request),&out_from->rev_requests);
355: rev_rwaits = out_to->rev_requests;
356: rev_swaits = out_from->rev_requests;
358: out_from->bs = out_to->bs = bs;
359: tag = out->tag;
360: comm = out->comm;
362: /* Register the receives that you will use later (sends for scatter reverse) */
363: for (i=0; i<out_from->n; i++) {
364: MPI_Recv_init(Srvalues+bs*rstarts[i],bs*rstarts[i+1]-bs*rstarts[i],MPIU_SCALAR,rprocs[i],tag,comm,rwaits+i);
365: MPI_Send_init(Srvalues+bs*rstarts[i],bs*rstarts[i+1]-bs*rstarts[i],MPIU_SCALAR,rprocs[i],tag,comm,rev_swaits+i);
366: }
368: PetscOptionsHasName(PETSC_NULL,"-vecscatter_rsend",&flg);
369: if (flg) {
370: out_to->use_readyreceiver = PETSC_TRUE;
371: out_from->use_readyreceiver = PETSC_TRUE;
372: for (i=0; i<out_to->n; i++) {
373: MPI_Rsend_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,comm,swaits+i);
374: }
375: if (out_from->n) {MPI_Startall_irecv(out_from->starts[out_from->n]*out_from->bs,out_from->n,out_from->requests);}
376: MPI_Barrier(comm);
377: PetscInfo(0,"Using VecScatter ready receiver mode\n");
378: } else {
379: out_to->use_readyreceiver = PETSC_FALSE;
380: out_from->use_readyreceiver = PETSC_FALSE;
381: PetscOptionsHasName(PETSC_NULL,"-vecscatter_ssend",&flg);
382: if (flg) {
383: PetscInfo(0,"Using VecScatter Ssend mode\n");
384: }
385: for (i=0; i<out_to->n; i++) {
386: if (!flg) {
387: MPI_Send_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,comm,swaits+i);
388: } else {
389: MPI_Ssend_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,comm,swaits+i);
390: }
391: }
392: }
393: /* Register receives for scatter reverse */
394: for (i=0; i<out_to->n; i++) {
395: MPI_Recv_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,comm,rev_rwaits+i);
396: }
397: }
399: return(0);
400: }
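/* Illustrative sketch (editor addition): the persistent-request lifecycle that the copy
   routine rebuilds for the new context.  Requests are created once with MPI_Send_init()/
   MPI_Recv_init(), reused with MPI_Startall()/MPI_Waitall() for every scatter, and only
   released with MPI_Request_free().  Toy program; assumes at least two ranks. */
#include <mpi.h>
#include <stdio.h>

int main(int argc,char **argv)
{
  int         rank,size,k;
  double      sbuf = 0,rbuf = -1;
  MPI_Request reqs[2];

  MPI_Init(&argc,&argv);
  MPI_Comm_rank(MPI_COMM_WORLD,&rank);
  MPI_Comm_size(MPI_COMM_WORLD,&size);
  if (size >= 2 && rank < 2) {
    int peer = 1 - rank;
    MPI_Recv_init(&rbuf,1,MPI_DOUBLE,peer,7,MPI_COMM_WORLD,&reqs[0]);
    MPI_Send_init(&sbuf,1,MPI_DOUBLE,peer,7,MPI_COMM_WORLD,&reqs[1]);
    for (k=0; k<3; k++) {             /* reuse the same requests for several "scatters" */
      sbuf = 10*rank + k;
      MPI_Startall(2,reqs);
      MPI_Waitall(2,reqs,MPI_STATUSES_IGNORE);
      printf("[%d] pass %d received %g\n",rank,k,rbuf);
    }
    MPI_Request_free(&reqs[0]);
    MPI_Request_free(&reqs[1]);
  }
  MPI_Finalize();
  return 0;
}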
401: /* --------------------------------------------------------------------------------------------------
402: Packs the message data into send buffers and unpacks it from receive buffers.
404: These routines could be generated automatically.
406: Fortran kernels etc. could be used.
407: */
408: PETSC_STATIC_INLINE void Pack_1(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y)
409: {
410: PetscInt i;
411: for (i=0; i<n; i++) {
412: y[i] = x[indicesx[i]];
413: }
414: }
415: PETSC_STATIC_INLINE void UnPack_1(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
416: {
417: PetscInt i;
418: switch (addv) {
419: case INSERT_VALUES:
420: for (i=0; i<n; i++) {
421: y[indicesy[i]] = x[i];
422: }
423: break;
424: case ADD_VALUES:
425: for (i=0; i<n; i++) {
426: y[indicesy[i]] += x[i];
427: }
428: break;
429: #if !defined(PETSC_USE_COMPLEX)
430: case MAX_VALUES:
431: for (i=0; i<n; i++) {
432: y[indicesy[i]] = PetscMax(y[indicesy[i]],x[i]);
433: }
434: #else
435: case MAX_VALUES:
436: #endif
437: case NOT_SET_VALUES:
438: break;
439: }
440: }
442: PETSC_STATIC_INLINE void Scatter_1(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
443: {
444: PetscInt i;
445: switch (addv) {
446: case INSERT_VALUES:
447: for (i=0; i<n; i++) {
448: y[indicesy[i]] = x[indicesx[i]];
449: }
450: break;
451: case ADD_VALUES:
452: for (i=0; i<n; i++) {
453: y[indicesy[i]] += x[indicesx[i]];
454: }
455: break;
456: #if !defined(PETSC_USE_COMPLEX)
457: case MAX_VALUES:
458: for (i=0; i<n; i++) {
459: y[indicesy[i]] = PetscMax(y[indicesy[i]],x[indicesx[i]]);
460: }
461: #else
462: case MAX_VALUES:
463: #endif
464: case NOT_SET_VALUES:
465: break;
466: }
467: }
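/* Illustrative sketch (editor addition): the Pack_<bs>/UnPack_<bs>/Scatter_<bs> routines
   above and below are hand-unrolled specializations of the generic blocked gather/scatter
   shown here.  bs is a runtime parameter in this sketch; the real kernels fix it at
   compile time so the inner loop can be unrolled.  Plain C, double for PetscScalar. */
#include <stdio.h>

static void PackGeneric(int n,int bs,const int *idx,const double *x,double *y)
{
  int i,j;
  for (i=0; i<n; i++) {
    for (j=0; j<bs; j++) y[j] = x[idx[i]+j];   /* gather one block into the buffer */
    y += bs;
  }
}

static void UnPackGenericAdd(int n,int bs,const double *x,const int *idy,double *y)
{
  int i,j;
  for (i=0; i<n; i++) {
    for (j=0; j<bs; j++) y[idy[i]+j] += x[j];  /* scatter-add one block (ADD_VALUES) */
    x += bs;
  }
}

int main(void)
{
  double x[6] = {1,2,3,4,5,6},buf[4],y[6] = {0};
  int    idx[] = {0,4},idy[] = {2,0};
  PackGeneric(2,2,idx,x,buf);        /* buf = {1,2,5,6} */
  UnPackGenericAdd(2,2,buf,idy,y);   /* y   = {5,6,1,2,0,0} */
  printf("y = %g %g %g %g\n",y[0],y[1],y[2],y[3]);
  return 0;
}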
469: /* ----------------------------------------------------------------------------------------------- */
470: PETSC_STATIC_INLINE void Pack_2(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y)
471: {
472: PetscInt i,idx;
474: for (i=0; i<n; i++) {
475: idx = *indicesx++;
476: y[0] = x[idx];
477: y[1] = x[idx+1];
478: y += 2;
479: }
480: }
481: PETSC_STATIC_INLINE void UnPack_2(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
482: {
483: PetscInt i,idy;
485: switch (addv) {
486: case INSERT_VALUES:
487: for (i=0; i<n; i++) {
488: idy = *indicesy++;
489: y[idy] = x[0];
490: y[idy+1] = x[1];
491: x += 2;
492: }
493: break;
494: case ADD_VALUES:
495: for (i=0; i<n; i++) {
496: idy = *indicesy++;
497: y[idy] += x[0];
498: y[idy+1] += x[1];
499: x += 2;
500: }
501: break;
502: #if !defined(PETSC_USE_COMPLEX)
503: case MAX_VALUES:
504: for (i=0; i<n; i++) {
505: idy = *indicesy++;
506: y[idy] = PetscMax(y[idy],x[0]);
507: y[idy+1] = PetscMax(y[idy+1],x[1]);
508: x += 2;
509: }
510: #else
511: case MAX_VALUES:
512: #endif
513: case NOT_SET_VALUES:
514: break;
515: }
516: }
518: PETSC_STATIC_INLINE void Scatter_2(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
519: {
520: PetscInt i,idx,idy;
522: switch (addv) {
523: case INSERT_VALUES:
524: for (i=0; i<n; i++) {
525: idx = *indicesx++;
526: idy = *indicesy++;
527: y[idy] = x[idx];
528: y[idy+1] = x[idx+1];
529: }
530: break;
531: case ADD_VALUES:
532: for (i=0; i<n; i++) {
533: idx = *indicesx++;
534: idy = *indicesy++;
535: y[idy] += x[idx];
536: y[idy+1] += x[idx+1];
537: }
538: break;
539: #if !defined(PETSC_USE_COMPLEX)
540: case MAX_VALUES:
541: for (i=0; i<n; i++) {
542: idx = *indicesx++;
543: idy = *indicesy++;
544: y[idy] = PetscMax(y[idy],x[idx]);
545: y[idy+1] = PetscMax(y[idy+1],x[idx+1]);
546: }
547: #else
548: case MAX_VALUES:
549: #endif
550: case NOT_SET_VALUES:
551: break;
552: }
553: }
554: /* ----------------------------------------------------------------------------------------------- */
555: PETSC_STATIC_INLINE void Pack_3(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y)
556: {
557: PetscInt i,idx;
559: for (i=0; i<n; i++) {
560: idx = *indicesx++;
561: y[0] = x[idx];
562: y[1] = x[idx+1];
563: y[2] = x[idx+2];
564: y += 3;
565: }
566: }
567: PETSC_STATIC_INLINE void UnPack_3(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
568: {
569: PetscInt i,idy;
571: switch (addv) {
572: case INSERT_VALUES:
573: for (i=0; i<n; i++) {
574: idy = *indicesy++;
575: y[idy] = x[0];
576: y[idy+1] = x[1];
577: y[idy+2] = x[2];
578: x += 3;
579: }
580: break;
581: case ADD_VALUES:
582: for (i=0; i<n; i++) {
583: idy = *indicesy++;
584: y[idy] += x[0];
585: y[idy+1] += x[1];
586: y[idy+2] += x[2];
587: x += 3;
588: }
589: break;
590: #if !defined(PETSC_USE_COMPLEX)
591: case MAX_VALUES:
592: for (i=0; i<n; i++) {
593: idy = *indicesy++;
594: y[idy] = PetscMax(y[idy],x[0]);
595: y[idy+1] = PetscMax(y[idy+1],x[1]);
596: y[idy+2] = PetscMax(y[idy+2],x[2]);
597: x += 3;
598: }
599: #else
600: case MAX_VALUES:
601: #endif
602: case NOT_SET_VALUES:
603: break;
604: }
605: }
607: PETSC_STATIC_INLINE void Scatter_3(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
608: {
609: PetscInt i,idx,idy;
611: switch (addv) {
612: case INSERT_VALUES:
613: for (i=0; i<n; i++) {
614: idx = *indicesx++;
615: idy = *indicesy++;
616: y[idy] = x[idx];
617: y[idy+1] = x[idx+1];
618: y[idy+2] = x[idx+2];
619: }
620: break;
621: case ADD_VALUES:
622: for (i=0; i<n; i++) {
623: idx = *indicesx++;
624: idy = *indicesy++;
625: y[idy] += x[idx];
626: y[idy+1] += x[idx+1];
627: y[idy+2] += x[idx+2];
628: }
629: break;
630: #if !defined(PETSC_USE_COMPLEX)
631: case MAX_VALUES:
632: for (i=0; i<n; i++) {
633: idx = *indicesx++;
634: idy = *indicesy++;
635: y[idy] = PetscMax(y[idy],x[idx]);
636: y[idy+1] = PetscMax(y[idy+1],x[idx+1]);
637: y[idy+2] = PetscMax(y[idy+2],x[idx+2]);
638: }
639: #else
640: case MAX_VALUES:
641: #endif
642: case NOT_SET_VALUES:
643: break;
644: }
645: }
646: /* ----------------------------------------------------------------------------------------------- */
647: PETSC_STATIC_INLINE void Pack_4(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y)
648: {
649: PetscInt i,idx;
651: for (i=0; i<n; i++) {
652: idx = *indicesx++;
653: y[0] = x[idx];
654: y[1] = x[idx+1];
655: y[2] = x[idx+2];
656: y[3] = x[idx+3];
657: y += 4;
658: }
659: }
660: PETSC_STATIC_INLINE void UnPack_4(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
661: {
662: PetscInt i,idy;
664: switch (addv) {
665: case INSERT_VALUES:
666: for (i=0; i<n; i++) {
667: idy = *indicesy++;
668: y[idy] = x[0];
669: y[idy+1] = x[1];
670: y[idy+2] = x[2];
671: y[idy+3] = x[3];
672: x += 4;
673: }
674: break;
675: case ADD_VALUES:
676: for (i=0; i<n; i++) {
677: idy = *indicesy++;
678: y[idy] += x[0];
679: y[idy+1] += x[1];
680: y[idy+2] += x[2];
681: y[idy+3] += x[3];
682: x += 4;
683: }
684: break;
685: #if !defined(PETSC_USE_COMPLEX)
686: case MAX_VALUES:
687: for (i=0; i<n; i++) {
688: idy = *indicesy++;
689: y[idy] = PetscMax(y[idy],x[0]);
690: y[idy+1] = PetscMax(y[idy+1],x[1]);
691: y[idy+2] = PetscMax(y[idy+2],x[2]);
692: y[idy+3] = PetscMax(y[idy+3],x[3]);
693: x += 4;
694: }
695: #else
696: case MAX_VALUES:
697: #endif
698: case NOT_SET_VALUES:
699: break;
700: }
701: }
703: PETSC_STATIC_INLINE void Scatter_4(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
704: {
705: PetscInt i,idx,idy;
707: switch (addv) {
708: case INSERT_VALUES:
709: for (i=0; i<n; i++) {
710: idx = *indicesx++;
711: idy = *indicesy++;
712: y[idy] = x[idx];
713: y[idy+1] = x[idx+1];
714: y[idy+2] = x[idx+2];
715: y[idy+3] = x[idx+3];
716: }
717: break;
718: case ADD_VALUES:
719: for (i=0; i<n; i++) {
720: idx = *indicesx++;
721: idy = *indicesy++;
722: y[idy] += x[idx];
723: y[idy+1] += x[idx+1];
724: y[idy+2] += x[idx+2];
725: y[idy+3] += x[idx+3];
726: }
727: break;
728: #if !defined(PETSC_USE_COMPLEX)
729: case MAX_VALUES:
730: for (i=0; i<n; i++) {
731: idx = *indicesx++;
732: idy = *indicesy++;
733: y[idy] = PetscMax(y[idy],x[idx]);
734: y[idy+1] = PetscMax(y[idy+1],x[idx+1]);
735: y[idy+2] = PetscMax(y[idy+2],x[idx+2]);
736: y[idy+3] = PetscMax(y[idy+3],x[idx+3]);
737: }
738: #else
739: case MAX_VALUES:
740: #endif
741: case NOT_SET_VALUES:
742: break;
743: }
744: }
745: /* ----------------------------------------------------------------------------------------------- */
746: PETSC_STATIC_INLINE void Pack_5(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y)
747: {
748: PetscInt i,idx;
750: for (i=0; i<n; i++) {
751: idx = *indicesx++;
752: y[0] = x[idx];
753: y[1] = x[idx+1];
754: y[2] = x[idx+2];
755: y[3] = x[idx+3];
756: y[4] = x[idx+4];
757: y += 5;
758: }
759: }
760: PETSC_STATIC_INLINE void UnPack_5(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
761: {
762: PetscInt i,idy;
764: switch (addv) {
765: case INSERT_VALUES:
766: for (i=0; i<n; i++) {
767: idy = *indicesy++;
768: y[idy] = x[0];
769: y[idy+1] = x[1];
770: y[idy+2] = x[2];
771: y[idy+3] = x[3];
772: y[idy+4] = x[4];
773: x += 5;
774: }
775: break;
776: case ADD_VALUES:
777: for (i=0; i<n; i++) {
778: idy = *indicesy++;
779: y[idy] += x[0];
780: y[idy+1] += x[1];
781: y[idy+2] += x[2];
782: y[idy+3] += x[3];
783: y[idy+4] += x[4];
784: x += 5;
785: }
786: break;
787: #if !defined(PETSC_USE_COMPLEX)
788: case MAX_VALUES:
789: for (i=0; i<n; i++) {
790: idy = *indicesy++;
791: y[idy] = PetscMax(y[idy],x[0]);
792: y[idy+1] = PetscMax(y[idy+1],x[1]);
793: y[idy+2] = PetscMax(y[idy+2],x[2]);
794: y[idy+3] = PetscMax(y[idy+3],x[3]);
795: y[idy+4] = PetscMax(y[idy+4],x[4]);
796: x += 5;
797: }
798: #else
799: case MAX_VALUES:
800: #endif
801: case NOT_SET_VALUES:
802: break;
803: }
804: }
806: PETSC_STATIC_INLINE void Scatter_5(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
807: {
808: PetscInt i,idx,idy;
810: switch (addv) {
811: case INSERT_VALUES:
812: for (i=0; i<n; i++) {
813: idx = *indicesx++;
814: idy = *indicesy++;
815: y[idy] = x[idx];
816: y[idy+1] = x[idx+1];
817: y[idy+2] = x[idx+2];
818: y[idy+3] = x[idx+3];
819: y[idy+4] = x[idx+4];
820: }
821: break;
822: case ADD_VALUES:
823: for (i=0; i<n; i++) {
824: idx = *indicesx++;
825: idy = *indicesy++;
826: y[idy] += x[idx];
827: y[idy+1] += x[idx+1];
828: y[idy+2] += x[idx+2];
829: y[idy+3] += x[idx+3];
830: y[idy+4] += x[idx+4];
831: }
832: break;
833: #if !defined(PETSC_USE_COMPLEX)
834: case MAX_VALUES:
835: for (i=0; i<n; i++) {
836: idx = *indicesx++;
837: idy = *indicesy++;
838: y[idy] = PetscMax(y[idy],x[idx]);
839: y[idy+1] = PetscMax(y[idy+1],x[idx+1]);
840: y[idy+2] = PetscMax(y[idy+2],x[idx+2]);
841: y[idy+3] = PetscMax(y[idy+3],x[idx+3]);
842: y[idy+4] = PetscMax(y[idy+4],x[idx+4]);
843: }
844: #else
845: case MAX_VALUES:
846: #endif
847: case NOT_SET_VALUES:
848: break;
849: }
850: }
851: /* ----------------------------------------------------------------------------------------------- */
852: PETSC_STATIC_INLINE void Pack_6(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y)
853: {
854: PetscInt i,idx;
856: for (i=0; i<n; i++) {
857: idx = *indicesx++;
858: y[0] = x[idx];
859: y[1] = x[idx+1];
860: y[2] = x[idx+2];
861: y[3] = x[idx+3];
862: y[4] = x[idx+4];
863: y[5] = x[idx+5];
864: y += 6;
865: }
866: }
867: PETSC_STATIC_INLINE void UnPack_6(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
868: {
869: PetscInt i,idy;
871: switch (addv) {
872: case INSERT_VALUES:
873: for (i=0; i<n; i++) {
874: idy = *indicesy++;
875: y[idy] = x[0];
876: y[idy+1] = x[1];
877: y[idy+2] = x[2];
878: y[idy+3] = x[3];
879: y[idy+4] = x[4];
880: y[idy+5] = x[5];
881: x += 6;
882: }
883: break;
884: case ADD_VALUES:
885: for (i=0; i<n; i++) {
886: idy = *indicesy++;
887: y[idy] += x[0];
888: y[idy+1] += x[1];
889: y[idy+2] += x[2];
890: y[idy+3] += x[3];
891: y[idy+4] += x[4];
892: y[idy+5] += x[5];
893: x += 6;
894: }
895: break;
896: #if !defined(PETSC_USE_COMPLEX)
897: case MAX_VALUES:
898: for (i=0; i<n; i++) {
899: idy = *indicesy++;
900: y[idy] = PetscMax(y[idy],x[0]);
901: y[idy+1] = PetscMax(y[idy+1],x[1]);
902: y[idy+2] = PetscMax(y[idy+2],x[2]);
903: y[idy+3] = PetscMax(y[idy+3],x[3]);
904: y[idy+4] = PetscMax(y[idy+4],x[4]);
905: y[idy+5] = PetscMax(y[idy+5],x[5]);
906: x += 6;
907: }
908: #else
909: case MAX_VALUES:
910: #endif
911: case NOT_SET_VALUES:
912: break;
913: }
914: }
916: PETSC_STATIC_INLINE void Scatter_6(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
917: {
918: PetscInt i,idx,idy;
920: switch (addv) {
921: case INSERT_VALUES:
922: for (i=0; i<n; i++) {
923: idx = *indicesx++;
924: idy = *indicesy++;
925: y[idy] = x[idx];
926: y[idy+1] = x[idx+1];
927: y[idy+2] = x[idx+2];
928: y[idy+3] = x[idx+3];
929: y[idy+4] = x[idx+4];
930: y[idy+5] = x[idx+5];
931: }
932: break;
933: case ADD_VALUES:
934: for (i=0; i<n; i++) {
935: idx = *indicesx++;
936: idy = *indicesy++;
937: y[idy] += x[idx];
938: y[idy+1] += x[idx+1];
939: y[idy+2] += x[idx+2];
940: y[idy+3] += x[idx+3];
941: y[idy+4] += x[idx+4];
942: y[idy+5] += x[idx+5];
943: }
944: break;
945: #if !defined(PETSC_USE_COMPLEX)
946: case MAX_VALUES:
947: for (i=0; i<n; i++) {
948: idx = *indicesx++;
949: idy = *indicesy++;
950: y[idy] = PetscMax(y[idy],x[idx]);
951: y[idy+1] = PetscMax(y[idy+1],x[idx+1]);
952: y[idy+2] = PetscMax(y[idy+2],x[idx+2]);
953: y[idy+3] = PetscMax(y[idy+3],x[idx+3]);
954: y[idy+4] = PetscMax(y[idy+4],x[idx+4]);
955: y[idy+5] = PetscMax(y[idy+5],x[idx+5]);
956: }
957: #else
958: case MAX_VALUES:
959: #endif
960: case NOT_SET_VALUES:
961: break;
962: }
963: }
964: /* ----------------------------------------------------------------------------------------------- */
965: PETSC_STATIC_INLINE void Pack_7(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y)
966: {
967: PetscInt i,idx;
969: for (i=0; i<n; i++) {
970: idx = *indicesx++;
971: y[0] = x[idx];
972: y[1] = x[idx+1];
973: y[2] = x[idx+2];
974: y[3] = x[idx+3];
975: y[4] = x[idx+4];
976: y[5] = x[idx+5];
977: y[6] = x[idx+6];
978: y += 7;
979: }
980: }
981: PETSC_STATIC_INLINE void UnPack_7(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
982: {
983: PetscInt i,idy;
985: switch (addv) {
986: case INSERT_VALUES:
987: for (i=0; i<n; i++) {
988: idy = *indicesy++;
989: y[idy] = x[0];
990: y[idy+1] = x[1];
991: y[idy+2] = x[2];
992: y[idy+3] = x[3];
993: y[idy+4] = x[4];
994: y[idy+5] = x[5];
995: y[idy+6] = x[6];
996: x += 7;
997: }
998: break;
999: case ADD_VALUES:
1000: for (i=0; i<n; i++) {
1001: idy = *indicesy++;
1002: y[idy] += x[0];
1003: y[idy+1] += x[1];
1004: y[idy+2] += x[2];
1005: y[idy+3] += x[3];
1006: y[idy+4] += x[4];
1007: y[idy+5] += x[5];
1008: y[idy+6] += x[6];
1009: x += 7;
1010: }
1011: break;
1012: #if !defined(PETSC_USE_COMPLEX)
1013: case MAX_VALUES:
1014: for (i=0; i<n; i++) {
1015: idy = *indicesy++;
1016: y[idy] = PetscMax(y[idy],x[0]);
1017: y[idy+1] = PetscMax(y[idy+1],x[1]);
1018: y[idy+2] = PetscMax(y[idy+2],x[2]);
1019: y[idy+3] = PetscMax(y[idy+3],x[3]);
1020: y[idy+4] = PetscMax(y[idy+4],x[4]);
1021: y[idy+5] = PetscMax(y[idy+5],x[5]);
1022: y[idy+6] = PetscMax(y[idy+6],x[6]);
1023: x += 7;
1024: }
1025: #else
1026: case MAX_VALUES:
1027: #endif
1028: case NOT_SET_VALUES:
1029: break;
1030: }
1031: }
1033: PETSC_STATIC_INLINE void Scatter_7(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
1034: {
1035: PetscInt i,idx,idy;
1037: switch (addv) {
1038: case INSERT_VALUES:
1039: for (i=0; i<n; i++) {
1040: idx = *indicesx++;
1041: idy = *indicesy++;
1042: y[idy] = x[idx];
1043: y[idy+1] = x[idx+1];
1044: y[idy+2] = x[idx+2];
1045: y[idy+3] = x[idx+3];
1046: y[idy+4] = x[idx+4];
1047: y[idy+5] = x[idx+5];
1048: y[idy+6] = x[idx+6];
1049: }
1050: break;
1051: case ADD_VALUES:
1052: for (i=0; i<n; i++) {
1053: idx = *indicesx++;
1054: idy = *indicesy++;
1055: y[idy] += x[idx];
1056: y[idy+1] += x[idx+1];
1057: y[idy+2] += x[idx+2];
1058: y[idy+3] += x[idx+3];
1059: y[idy+4] += x[idx+4];
1060: y[idy+5] += x[idx+5];
1061: y[idy+6] += x[idx+6];
1062: }
1063: break;
1064: #if !defined(PETSC_USE_COMPLEX)
1065: case MAX_VALUES:
1066: for (i=0; i<n; i++) {
1067: idx = *indicesx++;
1068: idy = *indicesy++;
1069: y[idy] = PetscMax(y[idy],x[idx]);
1070: y[idy+1] = PetscMax(y[idy+1],x[idx+1]);
1071: y[idy+2] = PetscMax(y[idy+2],x[idx+2]);
1072: y[idy+3] = PetscMax(y[idy+3],x[idx+3]);
1073: y[idy+4] = PetscMax(y[idy+4],x[idx+4]);
1074: y[idy+5] = PetscMax(y[idy+5],x[idx+5]);
1075: y[idy+6] = PetscMax(y[idy+6],x[idx+6]);
1076: }
1077: #else
1078: case MAX_VALUES:
1079: #endif
1080: case NOT_SET_VALUES:
1081: break;
1082: }
1083: }
1084: /* ----------------------------------------------------------------------------------------------- */
1085: PETSC_STATIC_INLINE void Pack_8(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y)
1086: {
1087: PetscInt i,idx;
1089: for (i=0; i<n; i++) {
1090: idx = *indicesx++;
1091: y[0] = x[idx];
1092: y[1] = x[idx+1];
1093: y[2] = x[idx+2];
1094: y[3] = x[idx+3];
1095: y[4] = x[idx+4];
1096: y[5] = x[idx+5];
1097: y[6] = x[idx+6];
1098: y[7] = x[idx+7];
1099: y += 8;
1100: }
1101: }
1102: PETSC_STATIC_INLINE void UnPack_8(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
1103: {
1104: PetscInt i,idy;
1106: switch (addv) {
1107: case INSERT_VALUES:
1108: for (i=0; i<n; i++) {
1109: idy = *indicesy++;
1110: y[idy] = x[0];
1111: y[idy+1] = x[1];
1112: y[idy+2] = x[2];
1113: y[idy+3] = x[3];
1114: y[idy+4] = x[4];
1115: y[idy+5] = x[5];
1116: y[idy+6] = x[6];
1117: y[idy+7] = x[7];
1118: x += 8;
1119: }
1120: break;
1121: case ADD_VALUES:
1122: for (i=0; i<n; i++) {
1123: idy = *indicesy++;
1124: y[idy] += x[0];
1125: y[idy+1] += x[1];
1126: y[idy+2] += x[2];
1127: y[idy+3] += x[3];
1128: y[idy+4] += x[4];
1129: y[idy+5] += x[5];
1130: y[idy+6] += x[6];
1131: y[idy+7] += x[7];
1132: x += 8;
1133: }
1134: break;
1135: #if !defined(PETSC_USE_COMPLEX)
1136: case MAX_VALUES:
1137: for (i=0; i<n; i++) {
1138: idy = *indicesy++;
1139: y[idy] = PetscMax(y[idy],x[0]);
1140: y[idy+1] = PetscMax(y[idy+1],x[1]);
1141: y[idy+2] = PetscMax(y[idy+2],x[2]);
1142: y[idy+3] = PetscMax(y[idy+3],x[3]);
1143: y[idy+4] = PetscMax(y[idy+4],x[4]);
1144: y[idy+5] = PetscMax(y[idy+5],x[5]);
1145: y[idy+6] = PetscMax(y[idy+6],x[6]);
1146: y[idy+7] = PetscMax(y[idy+7],x[7]);
1147: x += 8;
1148: }
1149: #else
1150: case MAX_VALUES:
1151: #endif
1152: case NOT_SET_VALUES:
1153: break;
1154: }
1155: }
1157: PETSC_STATIC_INLINE void Scatter_8(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
1158: {
1159: PetscInt i,idx,idy;
1161: switch (addv) {
1162: case INSERT_VALUES:
1163: for (i=0; i<n; i++) {
1164: idx = *indicesx++;
1165: idy = *indicesy++;
1166: y[idy] = x[idx];
1167: y[idy+1] = x[idx+1];
1168: y[idy+2] = x[idx+2];
1169: y[idy+3] = x[idx+3];
1170: y[idy+4] = x[idx+4];
1171: y[idy+5] = x[idx+5];
1172: y[idy+6] = x[idx+6];
1173: y[idy+7] = x[idx+7];
1174: }
1175: break;
1176: case ADD_VALUES:
1177: for (i=0; i<n; i++) {
1178: idx = *indicesx++;
1179: idy = *indicesy++;
1180: y[idy] += x[idx];
1181: y[idy+1] += x[idx+1];
1182: y[idy+2] += x[idx+2];
1183: y[idy+3] += x[idx+3];
1184: y[idy+4] += x[idx+4];
1185: y[idy+5] += x[idx+5];
1186: y[idy+6] += x[idx+6];
1187: y[idy+7] += x[idx+7];
1188: }
1189: break;
1190: #if !defined(PETSC_USE_COMPLEX)
1191: case MAX_VALUES:
1192: for (i=0; i<n; i++) {
1193: idx = *indicesx++;
1194: idy = *indicesy++;
1195: y[idy] = PetscMax(y[idy],x[idx]);
1196: y[idy+1] = PetscMax(y[idy+1],x[idx+1]);
1197: y[idy+2] = PetscMax(y[idy+2],x[idx+2]);
1198: y[idy+3] = PetscMax(y[idy+3],x[idx+3]);
1199: y[idy+4] = PetscMax(y[idy+4],x[idx+4]);
1200: y[idy+5] = PetscMax(y[idy+5],x[idx+5]);
1201: y[idy+6] = PetscMax(y[idy+6],x[idx+6]);
1202: y[idy+7] = PetscMax(y[idy+7],x[idx+7]);
1203: }
1204: #else
1205: case MAX_VALUES:
1206: #endif
1207: case NOT_SET_VALUES:
1208: break;
1209: }
1210: }
1212: /* ----------------------------------------------------------------------------------------------- */
1213: PETSC_STATIC_INLINE void Pack_12(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y)
1214: {
1215: PetscInt i,idx;
1217: for (i=0; i<n; i++) {
1218: idx = *indicesx++;
1219: y[0] = x[idx];
1220: y[1] = x[idx+1];
1221: y[2] = x[idx+2];
1222: y[3] = x[idx+3];
1223: y[4] = x[idx+4];
1224: y[5] = x[idx+5];
1225: y[6] = x[idx+6];
1226: y[7] = x[idx+7];
1227: y[8] = x[idx+8];
1228: y[9] = x[idx+9];
1229: y[10] = x[idx+10];
1230: y[11] = x[idx+11];
1231: y += 12;
1232: }
1233: }
1234: PETSC_STATIC_INLINE void UnPack_12(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
1235: {
1236: PetscInt i,idy;
1238: switch (addv) {
1239: case INSERT_VALUES:
1240: for (i=0; i<n; i++) {
1241: idy = *indicesy++;
1242: y[idy] = x[0];
1243: y[idy+1] = x[1];
1244: y[idy+2] = x[2];
1245: y[idy+3] = x[3];
1246: y[idy+4] = x[4];
1247: y[idy+5] = x[5];
1248: y[idy+6] = x[6];
1249: y[idy+7] = x[7];
1250: y[idy+8] = x[8];
1251: y[idy+9] = x[9];
1252: y[idy+10] = x[10];
1253: y[idy+11] = x[11];
1254: x += 12;
1255: }
1256: break;
1257: case ADD_VALUES:
1258: for (i=0; i<n; i++) {
1259: idy = *indicesy++;
1260: y[idy] += x[0];
1261: y[idy+1] += x[1];
1262: y[idy+2] += x[2];
1263: y[idy+3] += x[3];
1264: y[idy+4] += x[4];
1265: y[idy+5] += x[5];
1266: y[idy+6] += x[6];
1267: y[idy+7] += x[7];
1268: y[idy+8] += x[8];
1269: y[idy+9] += x[9];
1270: y[idy+10] += x[10];
1271: y[idy+11] += x[11];
1272: x += 12;
1273: }
1274: break;
1275: #if !defined(PETSC_USE_COMPLEX)
1276: case MAX_VALUES:
1277: for (i=0; i<n; i++) {
1278: idy = *indicesy++;
1279: y[idy] = PetscMax(y[idy],x[0]);
1280: y[idy+1] = PetscMax(y[idy+1],x[1]);
1281: y[idy+2] = PetscMax(y[idy+2],x[2]);
1282: y[idy+3] = PetscMax(y[idy+3],x[3]);
1283: y[idy+4] = PetscMax(y[idy+4],x[4]);
1284: y[idy+5] = PetscMax(y[idy+5],x[5]);
1285: y[idy+6] = PetscMax(y[idy+6],x[6]);
1286: y[idy+7] = PetscMax(y[idy+7],x[7]);
1287: y[idy+8] = PetscMax(y[idy+8],x[8]);
1288: y[idy+9] = PetscMax(y[idy+9],x[9]);
1289: y[idy+10] = PetscMax(y[idy+10],x[10]);
1290: y[idy+11] = PetscMax(y[idy+11],x[11]);
1291: x += 12;
1292: }
1293: #else
1294: case MAX_VALUES:
1295: #endif
1296: case NOT_SET_VALUES:
1297: break;
1298: }
1299: }
1301: PETSC_STATIC_INLINE void Scatter_12(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
1302: {
1303: PetscInt i,idx,idy;
1305: switch (addv) {
1306: case INSERT_VALUES:
1307: for (i=0; i<n; i++) {
1308: idx = *indicesx++;
1309: idy = *indicesy++;
1310: y[idy] = x[idx];
1311: y[idy+1] = x[idx+1];
1312: y[idy+2] = x[idx+2];
1313: y[idy+3] = x[idx+3];
1314: y[idy+4] = x[idx+4];
1315: y[idy+5] = x[idx+5];
1316: y[idy+6] = x[idx+6];
1317: y[idy+7] = x[idx+7];
1318: y[idy+8] = x[idx+8];
1319: y[idy+9] = x[idx+9];
1320: y[idy+10] = x[idx+10];
1321: y[idy+11] = x[idx+11];
1322: }
1323: break;
1324: case ADD_VALUES:
1325: for (i=0; i<n; i++) {
1326: idx = *indicesx++;
1327: idy = *indicesy++;
1328: y[idy] += x[idx];
1329: y[idy+1] += x[idx+1];
1330: y[idy+2] += x[idx+2];
1331: y[idy+3] += x[idx+3];
1332: y[idy+4] += x[idx+4];
1333: y[idy+5] += x[idx+5];
1334: y[idy+6] += x[idx+6];
1335: y[idy+7] += x[idx+7];
1336: y[idy+8] += x[idx+8];
1337: y[idy+9] += x[idx+9];
1338: y[idy+10] += x[idx+10];
1339: y[idy+11] += x[idx+11];
1340: }
1341: break;
1342: #if !defined(PETSC_USE_COMPLEX)
1343: case MAX_VALUES:
1344: for (i=0; i<n; i++) {
1345: idx = *indicesx++;
1346: idy = *indicesy++;
1347: y[idy] = PetscMax(y[idy],x[idx]);
1348: y[idy+1] = PetscMax(y[idy+1],x[idx+1]);
1349: y[idy+2] = PetscMax(y[idy+2],x[idx+2]);
1350: y[idy+3] = PetscMax(y[idy+3],x[idx+3]);
1351: y[idy+4] = PetscMax(y[idy+4],x[idx+4]);
1352: y[idy+5] = PetscMax(y[idy+5],x[idx+5]);
1353: y[idy+6] = PetscMax(y[idy+6],x[idx+6]);
1354: y[idy+7] = PetscMax(y[idy+7],x[idx+7]);
1355: y[idy+8] = PetscMax(y[idy+8],x[idx+8]);
1356: y[idy+9] = PetscMax(y[idy+9],x[idx+9]);
1357: y[idy+10] = PetscMax(y[idy+10],x[idx+10]);
1358: y[idy+11] = PetscMax(y[idy+11],x[idx+11]);
1359: }
1360: #else
1361: case MAX_VALUES:
1362: #endif
1363: case NOT_SET_VALUES:
1364: break;
1365: }
1366: }
1368: /* Create the VecScatterBegin/End_<bs> routines for our chosen block sizes */
1369: #define BS 1
1370: #include "src/vec/vec/utils/vpscat.h"
1371: #define BS 2
1372: #include "src/vec/vec/utils/vpscat.h"
1373: #define BS 3
1374: #include "src/vec/vec/utils/vpscat.h"
1375: #define BS 4
1376: #include "src/vec/vec/utils/vpscat.h"
1377: #define BS 5
1378: #include "src/vec/vec/utils/vpscat.h"
1379: #define BS 6
1380: #include "src/vec/vec/utils/vpscat.h"
1381: #define BS 7
1382: #include "src/vec/vec/utils/vpscat.h"
1383: #define BS 8
1384: #include "src/vec/vec/utils/vpscat.h"
1385: #define BS 12
1386: #include "src/vec/vec/utils/vpscat.h"
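/* Illustrative sketch (editor addition): vpscat.h is re-included above once per BS value
   to stamp out the VecScatterBegin_<BS>/VecScatterEnd_<BS> kernels.  The same "generate a
   family of kernels from one template" idea can be shown self-contained with token
   pasting; the PackBlock_* names below are hypothetical and exist only in this sketch. */
#include <stdio.h>

#define DEFINE_PACK(BS)                                                         \
  static void PackBlock_##BS(int n,const int *idx,const double *x,double *y) { \
    int i,j;                                                                    \
    for (i=0; i<n; i++) {                                                       \
      for (j=0; j<BS; j++) y[j] = x[idx[i]+j];                                  \
      y += BS;                                                                  \
    }                                                                           \
  }

DEFINE_PACK(2)   /* defines PackBlock_2 */
DEFINE_PACK(3)   /* defines PackBlock_3 */

int main(void)
{
  double x[6] = {1,2,3,4,5,6},y[3];
  int    idx[] = {3};
  PackBlock_3(1,idx,x,y);
  printf("%g %g %g\n",y[0],y[1],y[2]);  /* 4 5 6 */
  return 0;
}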
1388: /* ==========================================================================================*/
1390: /* create parallel to sequential scatter context */
1392: PetscErrorCode VecScatterCreateCommon_PtoS(VecScatter_MPI_General *,VecScatter_MPI_General *,VecScatter);
1396: /*
1397: sendSizes[] and recvSizes[] cannot have any 0 entries. To support 0 entries you would need
1398: to change the code below to "compress out" the sendProcs[] and recvProcs[] entries that have 0 entries.
1400: This probably does not handle sends to self properly; those should be removed from the counts
1401: used to allocate space inside the from struct.
1402: */
1403: PetscErrorCode VecScatterCreateLocal_PtoS(PetscInt nsends,const PetscInt sendSizes[],const PetscInt sendProcs[],const PetscInt sendIdx[],PetscInt nrecvs,const PetscInt recvSizes[],const PetscInt recvProcs[],const PetscInt recvIdx[],PetscInt bs,VecScatter ctx)
1404: {
1405: VecScatter_MPI_General *from, *to;
1406: PetscInt sendSize, recvSize;
1407: PetscInt n, i;
1410: /* allocate entire send scatter context */
1411: PetscNew(VecScatter_MPI_General,&to);
1412: to->n = nsends;
1413: for(n = 0, sendSize = 0; n < to->n; n++) {sendSize += sendSizes[n];}
1414: PetscMalloc(to->n*sizeof(MPI_Request),&to->requests);
1415: PetscMalloc4(bs*sendSize,PetscScalar,&to->values,
1416: sendSize,PetscInt,&to->indices,
1417: to->n+1,PetscInt,&to->starts,
1418: to->n,PetscMPIInt,&to->procs);
1419: PetscMalloc2(PetscMax(to->n,nrecvs),MPI_Status,&to->sstatus,
1420: PetscMax(to->n,nrecvs),MPI_Status,&to->rstatus);
1421: to->starts[0] = 0;
1422: for(n = 0; n < to->n; n++) {
1423: if (sendSizes[n] <=0 ) SETERRQ2(PETSC_ERR_ARG_OUTOFRANGE,"sendSizes[n=%D] = %D cannot be less than 1",n,sendSizes[n]);
1424: to->starts[n+1] = to->starts[n] + sendSizes[n];
1425: to->procs[n] = sendProcs[n];
1426: for(i = to->starts[n]; i < to->starts[n]+sendSizes[n]; i++) {
1427: to->indices[i] = sendIdx[i];
1428: }
1429: }
1430: ctx->todata = (void *) to;
1432: /* allocate entire receive scatter context */
1433: PetscNew(VecScatter_MPI_General,&from);
1434: from->n = nrecvs;
1435: for(n = 0, recvSize = 0; n < from->n; n++) {recvSize += recvSizes[n];}
1436: PetscMalloc(from->n*sizeof(MPI_Request),&from->requests);
1437: PetscMalloc4(bs*recvSize,PetscScalar,&from->values,
1438: recvSize,PetscInt,&from->indices,
1439: from->n+1,PetscInt,&from->starts,
1440: from->n,PetscMPIInt,&from->procs);
1441: from->starts[0] = 0;
1442: for(n = 0; n < from->n; n++) {
1443: if (recvSizes[n] <=0 ) SETERRQ2(PETSC_ERR_ARG_OUTOFRANGE,"recvSizes[n=%D] = %D cannot be less than 1",n,recvSizes[n]);
1444: from->starts[n+1] = from->starts[n] + recvSizes[n];
1445: from->procs[n] = recvProcs[n];
1446: for(i = from->starts[n]; i < from->starts[n]+recvSizes[n]; i++) {
1447: from->indices[i] = recvIdx[i];
1448: }
1449: }
1450: ctx->fromdata = (void *)from;
1452: /* No local scatter optimization */
1453: from->local.n = 0;
1454: from->local.vslots = 0;
1455: to->local.n = 0;
1456: to->local.vslots = 0;
1457: from->local.nonmatching_computed = PETSC_FALSE;
1458: from->local.n_nonmatching = 0;
1459: from->local.slots_nonmatching = 0;
1460: to->local.nonmatching_computed = PETSC_FALSE;
1461: to->local.n_nonmatching = 0;
1462: to->local.slots_nonmatching = 0;
1464: from->type = VEC_SCATTER_MPI_GENERAL;
1465: to->type = VEC_SCATTER_MPI_GENERAL;
1466: from->bs = bs;
1467: to->bs = bs;
1469: VecScatterCreateCommon_PtoS(from, to, ctx);
1470: return(0);
1471: }
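/* Illustrative sketch (editor addition): the starts[] arrays built above are just an
   exclusive prefix sum of the per-message sizes, giving each message its slice of the
   flat indices[]/values[] arrays.  Standalone helper; the names are this sketch's own. */
#include <stdio.h>

static void BuildStarts(int n,const int sizes[],int starts[])
{
  int i;
  starts[0] = 0;
  for (i=0; i<n; i++) starts[i+1] = starts[i] + sizes[i];
}

int main(void)
{
  int sizes[] = {2,3,1};
  int starts[4];
  BuildStarts(3,sizes,starts);
  /* message 1 occupies indices[starts[1]..starts[2]-1] = indices[2..4] */
  printf("starts = %d %d %d %d (total %d entries)\n",starts[0],starts[1],starts[2],starts[3],starts[3]);
  return 0;
}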
1473: /*
1474: bs indicates how many elements there are in each block. Normally this would be 1.
1475: */
1478: PetscErrorCode VecScatterCreate_PtoS(PetscInt nx,const PetscInt *inidx,PetscInt ny,const PetscInt *inidy,Vec xin,Vec yin,PetscInt bs,VecScatter ctx)
1479: {
1480: VecScatter_MPI_General *from,*to;
1481: PetscMPIInt size,rank,imdex,tag,n;
1482: PetscInt *source = PETSC_NULL,*lens = PETSC_NULL,*owners = PETSC_NULL;
1483: PetscInt *lowner = PETSC_NULL,*start = PETSC_NULL,lengthy,lengthx;
1484: PetscInt *nprocs = PETSC_NULL,i,j,idx,nsends,nrecvs;
1485: PetscInt *owner = PETSC_NULL,*starts = PETSC_NULL,count,slen;
1486: PetscInt *rvalues,*svalues,base,nmax,*values,*indx,nprocslocal;
1487: MPI_Comm comm;
1488: MPI_Request *send_waits = PETSC_NULL,*recv_waits = PETSC_NULL;
1489: MPI_Status recv_status,*send_status;
1490: PetscErrorCode ierr;
1493: PetscObjectGetNewTag((PetscObject)ctx,&tag);
1494: PetscObjectGetComm((PetscObject)xin,&comm);
1495: MPI_Comm_rank(comm,&rank);
1496: MPI_Comm_size(comm,&size);
1497: owners = xin->map.range;
1498: VecGetSize(yin,&lengthy);
1499: VecGetSize(xin,&lengthx);
1501: /* first count number of contributors to each processor */
1502: PetscMalloc2(2*size,PetscInt,&nprocs,nx,PetscInt,&owner);
1503: PetscMemzero(nprocs,2*size*sizeof(PetscInt));
1504: j = 0;
1505: nsends = 0;
1506: for (i=0; i<nx; i++) {
1507: idx = inidx[i];
1508: if (idx < owners[j]) j = 0;
1509: for (; j<size; j++) {
1510: if (idx < owners[j+1]) {
1511: if (!nprocs[2*j]++) nsends++;
1512: nprocs[2*j+1] = 1;
1513: owner[i] = j;
1514: break;
1515: }
1516: }
1517: }
1518: nprocslocal = nprocs[2*rank];
1519: nprocs[2*rank] = nprocs[2*rank+1] = 0;
1520: if (nprocslocal) nsends--;
1522: /* inform other processors of number of messages and max length */
1523: PetscMaxSum(comm,nprocs,&nmax,&nrecvs);
1525: /* post receives: */
1526: PetscMalloc4(nrecvs*nmax,PetscInt,&rvalues,nrecvs,PetscInt,&lens,nrecvs,PetscInt,&source,nrecvs,MPI_Request,&recv_waits);
1527: for (i=0; i<nrecvs; i++) {
1528: MPI_Irecv((rvalues+nmax*i),nmax,MPIU_INT,MPI_ANY_SOURCE,tag,comm,recv_waits+i);
1529: }
1531: /* do sends:
1532: 1) starts[i] gives the starting index in svalues for entries going to
1533: the ith processor
1534: */
1535: PetscMalloc3(nx,PetscInt,&svalues,nsends,MPI_Request,&send_waits,size+1,PetscInt,&starts);
1536: starts[0] = 0;
1537: for (i=1; i<size; i++) { starts[i] = starts[i-1] + nprocs[2*i-2];}
1538: for (i=0; i<nx; i++) {
1539: if (owner[i] != rank) {
1540: svalues[starts[owner[i]]++] = inidx[i];
1541: }
1542: }
1544: starts[0] = 0;
1545: for (i=1; i<size+1; i++) { starts[i] = starts[i-1] + nprocs[2*i-2];}
1546: count = 0;
1547: for (i=0; i<size; i++) {
1548: if (nprocs[2*i+1]) {
1549: MPI_Isend(svalues+starts[i],nprocs[2*i],MPIU_INT,i,tag,comm,send_waits+count++);
1550: }
1551: }
1553: /* wait on receives */
1554: count = nrecvs;
1555: slen = 0;
1556: while (count) {
1557: MPI_Waitany(nrecvs,recv_waits,&imdex,&recv_status);
1558: /* unpack receives into our local space */
1559: MPI_Get_count(&recv_status,MPIU_INT,&n);
1560: source[imdex] = recv_status.MPI_SOURCE;
1561: lens[imdex] = n;
1562: slen += n;
1563: count--;
1564: }
1565:
1566: /* allocate entire send scatter context */
1567: PetscNew(VecScatter_MPI_General,&to);
1568: to->n = nrecvs;
1569: PetscMalloc(nrecvs*sizeof(MPI_Request),&to->requests);
1570: PetscMalloc4(bs*slen,PetscScalar,&to->values,slen,PetscInt,&to->indices,nrecvs+1,PetscInt,&to->starts,
1571: nrecvs,PetscMPIInt,&to->procs);
1572: PetscMalloc2(PetscMax(to->n,nsends),MPI_Status,&to->sstatus,PetscMax(to->n,nsends),MPI_Status,
1573: &to->rstatus);
1574: ctx->todata = (void*)to;
1575: to->starts[0] = 0;
1577: if (nrecvs) {
1578: PetscMalloc(nrecvs*sizeof(PetscInt),&indx);
1579: for (i=0; i<nrecvs; i++) indx[i] = i;
1580: PetscSortIntWithPermutation(nrecvs,source,indx);
1582: /* move the data into the send scatter */
1583: base = owners[rank];
1584: for (i=0; i<nrecvs; i++) {
1585: to->starts[i+1] = to->starts[i] + lens[indx[i]];
1586: to->procs[i] = source[indx[i]];
1587: values = rvalues + indx[i]*nmax;
1588: for (j=0; j<lens[indx[i]]; j++) {
1589: to->indices[to->starts[i] + j] = values[j] - base;
1590: }
1591: }
1592: PetscFree(indx);
1593: }
1594: PetscFree4(rvalues,lens,source,recv_waits);
1595:
1596: /* allocate entire receive scatter context */
1597: PetscNew(VecScatter_MPI_General,&from);
1598: from->n = nsends;
1600: PetscMalloc(nsends*sizeof(MPI_Request),&from->requests);
1601: PetscMalloc4((ny-nprocslocal)*bs,PetscScalar,&from->values,ny-nprocslocal,PetscInt,&from->indices,
1602: nsends+1,PetscInt,&from->starts,from->n,PetscMPIInt,&from->procs);
1603: ctx->fromdata = (void*)from;
1605: /* move data into receive scatter */
1606: PetscMalloc2(size,PetscInt,&lowner,nsends+1,PetscInt,&start);
1607: count = 0; from->starts[0] = start[0] = 0;
1608: for (i=0; i<size; i++) {
1609: if (nprocs[2*i+1]) {
1610: lowner[i] = count;
1611: from->procs[count++] = i;
1612: from->starts[count] = start[count] = start[count-1] + nprocs[2*i];
1613: }
1614: }
1615: for (i=0; i<nx; i++) {
1616: if (owner[i] != rank) {
1617: from->indices[start[lowner[owner[i]]]++] = inidy[i];
1618: if (inidy[i] >= lengthy) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Scattering past end of TO vector");
1619: }
1620: }
1621: PetscFree2(lowner,start);
1622: PetscFree2(nprocs,owner);
1623:
1624: /* wait on sends */
1625: if (nsends) {
1626: PetscMalloc(nsends*sizeof(MPI_Status),&send_status);
1627: MPI_Waitall(nsends,send_waits,send_status);
1628: PetscFree(send_status);
1629: }
1630: PetscFree3(svalues,send_waits,starts);
1632: if (nprocslocal) {
1633: PetscInt nt = from->local.n = to->local.n = nprocslocal;
1634: /* we have a scatter to ourselves */
1635: PetscMalloc(nt*sizeof(PetscInt),&to->local.vslots);
1636: PetscMalloc(nt*sizeof(PetscInt),&from->local.vslots);
1637: nt = 0;
1638: for (i=0; i<nx; i++) {
1639: idx = inidx[i];
1640: if (idx >= owners[rank] && idx < owners[rank+1]) {
1641: to->local.vslots[nt] = idx - owners[rank];
1642: from->local.vslots[nt++] = inidy[i];
1643: if (inidy[i] >= lengthy) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Scattering past end of TO vector");
1644: }
1645: }
1646: } else {
1647: from->local.n = 0;
1648: from->local.vslots = 0;
1649: to->local.n = 0;
1650: to->local.vslots = 0;
1651: }
1652: from->local.nonmatching_computed = PETSC_FALSE;
1653: from->local.n_nonmatching = 0;
1654: from->local.slots_nonmatching = 0;
1655: to->local.nonmatching_computed = PETSC_FALSE;
1656: to->local.n_nonmatching = 0;
1657: to->local.slots_nonmatching = 0;
1659: from->type = VEC_SCATTER_MPI_GENERAL;
1660: to->type = VEC_SCATTER_MPI_GENERAL;
1661: from->bs = bs;
1662: to->bs = bs;
1664: VecScatterCreateCommon_PtoS(from,to,ctx);
1665: return(0);
1666: }
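/* Illustrative sketch (editor addition): the request/reply pattern used above in
   miniature.  Receivers post MPI_Irecv() with MPI_ANY_SOURCE, then use MPI_Waitany(),
   MPI_Get_count() and status.MPI_SOURCE to discover who sent what, exactly as the index
   lists are gathered above.  Toy program; every rank sends one int to rank 0. */
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>

int main(int argc,char **argv)
{
  int rank,size,i,count,idx,n,msg;

  MPI_Init(&argc,&argv);
  MPI_Comm_rank(MPI_COMM_WORLD,&rank);
  MPI_Comm_size(MPI_COMM_WORLD,&size);

  if (rank) {
    msg = 100 + rank;
    MPI_Send(&msg,1,MPI_INT,0,5,MPI_COMM_WORLD);
  } else {
    int         *rvalues = malloc((size>1 ? size-1 : 1)*sizeof(int));
    MPI_Request *rw      = malloc((size>1 ? size-1 : 1)*sizeof(MPI_Request));
    MPI_Status   status;
    for (i=0; i<size-1; i++) MPI_Irecv(rvalues+i,1,MPI_INT,MPI_ANY_SOURCE,5,MPI_COMM_WORLD,rw+i);
    for (n=size-1; n>0; n--) {                 /* drain receives in whatever order they finish */
      MPI_Waitany(size-1,rw,&idx,&status);
      MPI_Get_count(&status,MPI_INT,&count);
      printf("slot %d: %d value(s) from rank %d, first = %d\n",idx,count,status.MPI_SOURCE,rvalues[idx]);
    }
    free(rvalues);free(rw);
  }
  MPI_Finalize();
  return 0;
}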
1668: /*
1669: bs indicates how many elements there are in each block. Normally this would be 1.
1670: */
1673: PetscErrorCode VecScatterCreateCommon_PtoS(VecScatter_MPI_General *from,VecScatter_MPI_General *to,VecScatter ctx)
1674: {
1675: MPI_Comm comm = ctx->comm;
1676: PetscMPIInt tag = ctx->tag, tagr;
1677: PetscInt bs = to->bs;
1678: PetscMPIInt size;
1679: PetscInt i, n;
1681:
1683: PetscObjectGetNewTag((PetscObject)ctx,&tagr);
1684: ctx->destroy = VecScatterDestroy_PtoP;
1686: PetscOptionsHasName(PETSC_NULL,"-vecscatter_sendfirst",&to->sendfirst);
1687: from->sendfirst = to->sendfirst;
1689: MPI_Comm_size(comm,&size);
1690: /* check if the receives are ALL going into contiguous locations; if so can skip indexing */
1691: to->contiq = PETSC_FALSE;
1692: n = from->starts[from->n];
1693: from->contiq = PETSC_TRUE;
1694: for (i=1; i<n; i++) {
1695: if (from->indices[i] != from->indices[i-1] + bs) {
1696: from->contiq = PETSC_FALSE;
1697: break;
1698: }
1699: }
1701: PetscOptionsHasName(PETSC_NULL,"-vecscatter_alltoall",&to->use_alltoallv);
1702: from->use_alltoallv = to->use_alltoallv;
1703: if (from->use_alltoallv) PetscInfo(0,"Using MPI_Alltoallv() for scatter\n");
1704: #if defined(PETSC_HAVE_MPI_ALLTOALLW)
1705: if (to->use_alltoallv) {
1706: PetscOptionsHasName(PETSC_NULL,"-vecscatter_nopack",&to->use_alltoallw);
1707: }
1708: from->use_alltoallw = to->use_alltoallw;
1709: if (from->use_alltoallw) PetscInfo(0,"Using MPI_Alltoallw() for scatter\n");
1710: #endif
1712: #if defined(PETSC_HAVE_MPI_WIN_CREATE)
1713: PetscOptionsHasName(PETSC_NULL,"-vecscatter_window",&to->use_window);
1714: from->use_window = to->use_window;
1715: #endif
1717: if (to->use_alltoallv) {
1719: PetscMalloc2(size,PetscMPIInt,&to->counts,size,PetscMPIInt,&to->displs);
1720: PetscMemzero(to->counts,size*sizeof(PetscMPIInt));
1721: for (i=0; i<to->n; i++) {
1722: to->counts[to->procs[i]] = bs*(to->starts[i+1] - to->starts[i]);
1723: }
1724: to->displs[0] = 0;
1725: for (i=1; i<size; i++) {
1726: to->displs[i] = to->displs[i-1] + to->counts[i-1];
1727: }
1729: PetscMalloc2(size,PetscMPIInt,&from->counts,size,PetscMPIInt,&from->displs);
1730: PetscMemzero(from->counts,size*sizeof(PetscMPIInt));
1731: for (i=0; i<from->n; i++) {
1732: from->counts[from->procs[i]] = bs*(from->starts[i+1] - from->starts[i]);
1733: }
1734: from->displs[0] = 0;
1735: for (i=1; i<size; i++) {
1736: from->displs[i] = from->displs[i-1] + from->counts[i-1];
1737: }
1738: #if defined(PETSC_HAVE_MPI_ALLTOALLW)
1739: if (to->use_alltoallw) {
1740: ctx->packtogether = PETSC_FALSE;
1741: PetscMalloc3(size,PetscMPIInt,&to->wcounts,size,PetscMPIInt,&to->wdispls,size,MPI_Datatype,&to->types);
1742: PetscMemzero(to->wcounts,size*sizeof(PetscMPIInt));
1743: PetscMemzero(to->wdispls,size*sizeof(PetscMPIInt));
1744: for (i=0; i<size; i++) {
1745: to->types[i] = MPIU_SCALAR;
1746: }
1748: for (i=0; i<to->n; i++) {
1749: to->wcounts[to->procs[i]] = 1;
1750: MPI_Type_create_indexed_block(to->starts[i+1]-to->starts[i],bs,to->indices+to->starts[i],MPIU_SCALAR,to->types+to->procs[i]);
1751: MPI_Type_commit(to->types+to->procs[i]);
1752: }
1753: PetscMalloc3(size,PetscMPIInt,&from->wcounts,size,PetscMPIInt,&from->wdispls,size,MPI_Datatype,&from->types);
1754: PetscMemzero(from->wcounts,size*sizeof(PetscMPIInt));
1755: PetscMemzero(from->wdispls,size*sizeof(PetscMPIInt));
1756: for (i=0; i<size; i++) {
1757: from->types[i] = MPIU_SCALAR;
1758: }
1759: if (from->contiq) {
1760: PetscInfo(0,"Scattered vector entries are stored contiguously, taking advantage of this with -vecscatter_alltoall\n");
1761: for (i=0; i<from->n; i++) {
1762: from->wcounts[from->procs[i]] = bs*(from->starts[i+1] - from->starts[i]);
1763: }
1764: if (from->n) from->wdispls[from->procs[0]] = sizeof(PetscScalar)*from->indices[0];
1765: for (i=1; i<from->n; i++) {
1766: from->wdispls[from->procs[i]] = from->wdispls[from->procs[i-1]] + sizeof(PetscScalar)*from->wcounts[from->procs[i-1]];
1767: }
1768: } else {
1769: for (i=0; i<from->n; i++) {
1770: from->wcounts[from->procs[i]] = 1;
1771: MPI_Type_create_indexed_block(from->starts[i+1]-from->starts[i],bs,from->indices+from->starts[i],MPIU_SCALAR,from->types+from->procs[i]);
1772: MPI_Type_commit(from->types+from->procs[i]);
1773: }
1774: }
1775: }
1776: #else
1777: to->use_alltoallw = PETSC_FALSE;
1778: from->use_alltoallw = PETSC_FALSE;
1779: #endif
1780: #if defined(PETSC_HAVE_MPI_WIN_CREATE)
1781: } else if (to->use_window) {
1782: PetscMPIInt temptag,winsize;
1783: MPI_Request *request;
1784: MPI_Status *status;
1785:
1786: PetscObjectGetNewTag((PetscObject)ctx,&temptag);
1787: winsize = (to->n ? to->starts[to->n] : 0)*sizeof(PetscScalar);
1788: MPI_Win_create(to->values ? to->values : MPI_BOTTOM,winsize,sizeof(PetscScalar),MPI_INFO_NULL,comm,&to->window);
1789: PetscMalloc(to->n*sizeof(PetscInt),&to->winstarts);
1790: PetscMalloc2(to->n,MPI_Request,&request,to->n,MPI_Status,&status);
1791: for (i=0; i<to->n; i++) {
1792: MPI_Irecv(to->winstarts+i,1,MPIU_INT,to->procs[i],temptag,comm,request+i);
1793: }
1794: for (i=0; i<from->n; i++) {
1795: MPI_Send(from->starts+i,1,MPIU_INT,from->procs[i],temptag,comm);
1796: }
1797: MPI_Waitall(to->n,request,status);
1798: PetscFree2(request,status);
1800: winsize = (from->n ? from->starts[from->n] : 0)*sizeof(PetscScalar);
1801: MPI_Win_create(from->values ? from->values : MPI_BOTTOM,winsize,sizeof(PetscScalar),MPI_INFO_NULL,comm,&from->window);
1802: PetscMalloc(from->n*sizeof(PetscInt),&from->winstarts);
1803: PetscMalloc2(from->n,MPI_Request,&request,from->n,MPI_Status,&status);
1804: for (i=0; i<from->n; i++) {
1805: MPI_Irecv(from->winstarts+i,1,MPIU_INT,from->procs[i],temptag,comm,request+i);
1806: }
1807: for (i=0; i<to->n; i++) {
1808: MPI_Send(to->starts+i,1,MPIU_INT,to->procs[i],temptag,comm);
1809: }
1810: MPI_Waitall(from->n,request,status);
1811: PetscFree2(request,status);
1812: #endif
1813: } else {
1814: PetscTruth use_rsend = PETSC_FALSE, use_ssend = PETSC_FALSE;
1815: PetscInt *sstarts = to->starts, *rstarts = from->starts;
1816: PetscMPIInt *sprocs = to->procs, *rprocs = from->procs;
1817: MPI_Request *swaits = to->requests,*rwaits = from->requests;
1818: MPI_Request *rev_swaits,*rev_rwaits;
1819: PetscScalar *Ssvalues = to->values, *Srvalues = from->values;
1821: /* allocate additional wait variables for the "reverse" scatter */
1822: PetscMalloc(to->n*sizeof(MPI_Request),&rev_rwaits);
1823: PetscMalloc(from->n*sizeof(MPI_Request),&rev_swaits);
1824: to->rev_requests = rev_rwaits;
1825: from->rev_requests = rev_swaits;
1827: /* Register the receives that you will use later (sends for scatter reverse) */
1828: PetscOptionsHasName(PETSC_NULL,"-vecscatter_rsend",&use_rsend);
1829: PetscOptionsHasName(PETSC_NULL,"-vecscatter_ssend",&use_ssend);
1830: if (use_rsend) {
1831: PetscInfo(0,"Using VecScatter ready receiver mode\n");
1832: to->use_readyreceiver = PETSC_TRUE;
1833: from->use_readyreceiver = PETSC_TRUE;
1834: } else {
1835: to->use_readyreceiver = PETSC_FALSE;
1836: from->use_readyreceiver = PETSC_FALSE;
1837: }
1838: if (use_ssend) {
1839: PetscInfo(0,"Using VecScatter Ssend mode\n");
1840: }
1842: for (i=0; i<from->n; i++) {
1843: if (use_rsend) {
1844: MPI_Rsend_init(Srvalues+bs*rstarts[i],bs*rstarts[i+1]-bs*rstarts[i],MPIU_SCALAR,rprocs[i],tagr,comm,rev_swaits+i);
1845: } else if (use_ssend) {
1846: MPI_Ssend_init(Srvalues+bs*rstarts[i],bs*rstarts[i+1]-bs*rstarts[i],MPIU_SCALAR,rprocs[i],tagr,comm,rev_swaits+i);
1847: } else {
1848: MPI_Send_init(Srvalues+bs*rstarts[i],bs*rstarts[i+1]-bs*rstarts[i],MPIU_SCALAR,rprocs[i],tagr,comm,rev_swaits+i);
1849: }
1850: }
1852: for (i=0; i<to->n; i++) {
1853: if (use_rsend) {
1854: MPI_Rsend_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,comm,swaits+i);
1855: } else if (use_ssend) {
1856: MPI_Ssend_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,comm,swaits+i);
1857: } else {
1858: MPI_Send_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,comm,swaits+i);
1859: }
1860: }
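/* The MPI_Send_init/MPI_Recv_init calls above and below only register persistent requests;
   nothing moves until they are started.  A minimal sketch of how a forward scatter would
   drive them -- an assumption about the Begin/End routines, not code from this file:

     MPI_Startall(from->n,rwaits);                      post the receives
     ... pack the needed entries into Ssvalues ...
     MPI_Startall(to->n,swaits);                        start the sends
     MPI_Waitall(from->n,rwaits,MPI_STATUSES_IGNORE);   then unpack the received data

   A reverse scatter uses rev_swaits/rev_rwaits (registered with tagr) the same way. */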
1861: /* Register receives for scatter and reverse */
1862: for (i=0; i<from->n; i++) {
1863: MPI_Recv_init(Srvalues+bs*rstarts[i],bs*rstarts[i+1]-bs*rstarts[i],MPIU_SCALAR,rprocs[i],tag,comm,rwaits+i);
1864: }
1865: for (i=0; i<to->n; i++) {
1866: MPI_Recv_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tagr,comm,rev_rwaits+i);
1867: }
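/* Ready-send mode requires every matching receive to already be posted when the MPI_Rsend
   is started, so the forward receives (from->requests) and reverse receives
   (to->rev_requests) are started once here, and the barrier guarantees all ranks have
   done so before any scatter begins. */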
1868: if (use_rsend) {
1869: if (to->n) {MPI_Startall_irecv(to->starts[to->n]*to->bs,to->n,to->rev_requests);}
1870: if (from->n) {MPI_Startall_irecv(from->starts[from->n]*from->bs,from->n,from->requests);}
1871: MPI_Barrier(comm);
1872: }
1874: ctx->copy = VecScatterCopy_PtoP_X;
1875: }
1876: PetscInfo1(0,"Using blocksize %D scatter\n",bs);
1877: switch (bs) {
1878: case 12:
1879: ctx->begin = VecScatterBegin_12;
1880: ctx->end = VecScatterEnd_12;
1881: break;
1882: case 8:
1883: ctx->begin = VecScatterBegin_8;
1884: ctx->end = VecScatterEnd_8;
1885: break;
1886: case 7:
1887: ctx->begin = VecScatterBegin_7;
1888: ctx->end = VecScatterEnd_7;
1889: break;
1890: case 6:
1891: ctx->begin = VecScatterBegin_6;
1892: ctx->end = VecScatterEnd_6;
1893: break;
1894: case 5:
1895: ctx->begin = VecScatterBegin_5;
1896: ctx->end = VecScatterEnd_5;
1897: break;
1898: case 4:
1899: ctx->begin = VecScatterBegin_4;
1900: ctx->end = VecScatterEnd_4;
1901: break;
1902: case 3:
1903: ctx->begin = VecScatterBegin_3;
1904: ctx->end = VecScatterEnd_3;
1905: break;
1906: case 2:
1907: ctx->begin = VecScatterBegin_2;
1908: ctx->end = VecScatterEnd_2;
1909: break;
1910: case 1:
1911: ctx->begin = VecScatterBegin_1;
1912: ctx->end = VecScatterEnd_1;
1913: break;
1914: default:
1915: SETERRQ(PETSC_ERR_SUP,"Blocksize not supported");
1916: }
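/* Only these block sizes have hand-unrolled pack/unpack kernels (VecScatterBegin_<bs> /
   VecScatterEnd_<bs>); any other block size is rejected here as unsupported. */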
1917: ctx->view = VecScatterView_MPI;
1919: /* Check if the local scatter is actually a copy; important special case */
1920: if (to->local.n) {
1921: VecScatterLocalOptimizeCopy_Private(&to->local,&from->local,bs);
1922: }
1923: return(0);
1924: }
1928: /* ------------------------------------------------------------------------------------*/
1929: /*
1930: Scatter from local Seq vectors to a parallel vector.
1931: Reverses the order of the arguments, calls VecScatterCreate_PtoS() then
1932: reverses the result.
1933: */
1936: PetscErrorCode VecScatterCreate_StoP(PetscInt nx,const PetscInt *inidx,PetscInt ny,const PetscInt *inidy,Vec xin,Vec yin,PetscInt bs,VecScatter ctx)
1937: {
1938: PetscErrorCode ierr;
1939: MPI_Request *waits;
1940: VecScatter_MPI_General *to,*from;
1943: VecScatterCreate_PtoS(ny,inidy,nx,inidx,yin,xin,bs,ctx);
1944: to = (VecScatter_MPI_General*)ctx->fromdata;
1945: from = (VecScatter_MPI_General*)ctx->todata;
1946: ctx->todata = (void*)to;
1947: ctx->fromdata = (void*)from;
1948: /* these two are special; they are ALWAYS stored in the to struct */
1949: to->sstatus = from->sstatus;
1950: to->rstatus = from->rstatus;
1952: from->sstatus = 0;
1953: from->rstatus = 0;
1955: waits = from->rev_requests;
1956: from->rev_requests = from->requests;
1957: from->requests = waits;
1958: waits = to->rev_requests;
1959: to->rev_requests = to->requests;
1960: to->requests = waits;
1961: return(0);
1962: }
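/* Building the StoP scatter as a PtoS scatter with the index sets exchanged means the
   forward direction of the new context is the reverse of the one just created; hence the
   swap of todata/fromdata and of requests/rev_requests above.  sstatus/rstatus are moved
   because the rest of the code always looks for them in the to struct. */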
1964: /* ---------------------------------------------------------------------------------*/
1967: PetscErrorCode VecScatterCreate_PtoP(PetscInt nx,const PetscInt *inidx,PetscInt ny,const PetscInt *inidy,Vec xin,Vec yin,VecScatter ctx)
1968: {
1970: PetscMPIInt size,rank,tag,imdex,n;
1971: PetscInt *lens = PETSC_NULL,*owners = xin->map.range;
1972: PetscInt *nprocs = PETSC_NULL,i,j,idx,nsends,nrecvs,*local_inidx = PETSC_NULL,*local_inidy = PETSC_NULL;
1973: PetscInt *owner = PETSC_NULL,*starts = PETSC_NULL,count,slen;
1974: PetscInt *rvalues = PETSC_NULL,*svalues = PETSC_NULL,base,nmax,*values = PETSC_NULL,lastidx;
1975: MPI_Comm comm;
1976: MPI_Request *send_waits = PETSC_NULL,*recv_waits = PETSC_NULL;
1977: MPI_Status recv_status,*send_status = PETSC_NULL;
1978: PetscTruth duplicate = PETSC_FALSE;
1979: #if defined(PETSC_USE_DEBUG)
1980: PetscTruth found = PETSC_FALSE;
1981: #endif
1984: PetscObjectGetNewTag((PetscObject)ctx,&tag);
1985: PetscObjectGetComm((PetscObject)xin,&comm);
1986: MPI_Comm_size(comm,&size);
1987: MPI_Comm_rank(comm,&rank);
1988: if (size == 1) {
1989: VecScatterCreate_StoP(nx,inidx,ny,inidy,xin,yin,1,ctx);
1990: return(0);
1991: }
1993: /*
1994: Each processor ships off its inidx[j] and inidy[j] to the appropriate processor
1995: and then calls VecScatterCreate_StoP() on the (from,to) pairs it received.
1996: */
1997: /* first count number of contributors to each processor */
1998: PetscMalloc3(2*size,PetscInt,&nprocs,nx,PetscInt,&owner,(size+1),PetscInt,&starts);
1999: PetscMemzero(nprocs,2*size*sizeof(PetscInt));
2000: lastidx = -1;
2001: j = 0;
2002: for (i=0; i<nx; i++) {
2003: /* if indices are NOT locally sorted, need to start search at the beginning */
2004: if (lastidx > (idx = inidx[i])) j = 0;
2005: lastidx = idx;
2006: for (; j<size; j++) {
2007: if (idx >= owners[j] && idx < owners[j+1]) {
2008: nprocs[2*j]++;
2009: nprocs[2*j+1] = 1;
2010: owner[i] = j;
2011: #if defined(PETSC_USE_DEBUG)
2012: found = PETSC_TRUE;
2013: #endif
2014: break;
2015: }
2016: }
2017: #if defined(PETSC_USE_DEBUG)
2018: if (!found) SETERRQ1(PETSC_ERR_ARG_OUTOFRANGE,"Index %D out of range",idx);
2019: found = PETSC_FALSE;
2020: #endif
2021: }
2022: nsends = 0; for (i=0; i<size; i++) { nsends += nprocs[2*i+1];}
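/* nprocs[] interleaves, for each target process j, a contribution count (nprocs[2*j]) and
   a 0/1 flag (nprocs[2*j+1]) recording whether anything goes to j at all; for example,
   with ownership ranges {0,10,20,30} an index of 25 belongs to process 2, so nprocs[4] is
   incremented and nprocs[5] set to 1.  nsends is then the number of distinct destinations,
   and PetscMaxSum() below reduces the interleaved array to nmax (the longest message this
   process will receive) and nrecvs (how many messages it will receive). */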
2024: /* inform other processors of number of messages and max length */
2025: PetscMaxSum(comm,nprocs,&nmax,&nrecvs);
2027: /* post receives: */
2028: PetscMalloc6(2*nrecvs*nmax,PetscInt,&rvalues,2*nx,PetscInt,&svalues,2*nrecvs,PetscInt,&lens,nrecvs,MPI_Request,&recv_waits,nsends,MPI_Request,&send_waits,nsends,MPI_Status,&send_status);
2030: for (i=0; i<nrecvs; i++) {
2031: MPI_Irecv(rvalues+2*nmax*i,2*nmax,MPIU_INT,MPI_ANY_SOURCE,tag,comm,recv_waits+i);
2032: }
2034: /* do sends:
2035: 1) starts[i] gives the starting index in svalues for stuff going to
2036: the ith processor
2037: */
2038: starts[0]= 0;
2039: for (i=1; i<size; i++) { starts[i] = starts[i-1] + nprocs[2*i-2];}
2040: for (i=0; i<nx; i++) {
2041: svalues[2*starts[owner[i]]] = inidx[i];
2042: svalues[1 + 2*starts[owner[i]]++] = inidy[i];
2043: }
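/* The packing loop above advanced starts[owner[i]] as it interleaved (from,to) index pairs
   into svalues, so starts[] no longer holds offsets; it is rebuilt below before the sends. */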
2045: starts[0] = 0;
2046: for (i=1; i<size+1; i++) { starts[i] = starts[i-1] + nprocs[2*i-2];}
2047: count = 0;
2048: for (i=0; i<size; i++) {
2049: if (nprocs[2*i+1]) {
2050: MPI_Isend(svalues+2*starts[i],2*nprocs[2*i],MPIU_INT,i,tag,comm,send_waits+count);
2051: count++;
2052: }
2053: }
2054: PetscFree3(nprocs,owner,starts);
2056: /* wait on receives */
2057: count = nrecvs;
2058: slen = 0;
2059: while (count) {
2060: MPI_Waitany(nrecvs,recv_waits,&imdex,&recv_status);
2061: /* unpack receives into our local space */
2062: MPI_Get_count(&recv_status,MPIU_INT,&n);
2063: lens[imdex] = n/2;
2064: slen += n/2;
2065: count--;
2066: }
2067:
2068: PetscMalloc2(slen,PetscInt,&local_inidx,slen,PetscInt,&local_inidy);
2069: base = owners[rank];
2070: count = 0;
2071: for (i=0; i<nrecvs; i++) {
2072: values = rvalues + 2*i*nmax;
2073: for (j=0; j<lens[i]; j++) {
2074: local_inidx[count] = values[2*j] - base;
2075: local_inidy[count++] = values[2*j+1];
2076: }
2077: }
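/* The received x indices are global; subtracting base = owners[rank] converts them to
   local indices into this process's owned block, while the y indices are kept as sent. */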
2079: /* wait on sends */
2080: if (nsends) {MPI_Waitall(nsends,send_waits,send_status);}
2081: PetscFree6(rvalues,svalues,lens,recv_waits,send_waits,send_status);
2083: /*
2084: should sort and remove duplicates from local_inidx,local_inidy
2085: */
2087: #if defined(do_it_slow)
2088: /* sort on the from index */
2089: PetscSortIntWithArray(slen,local_inidx,local_inidy);
2090: start = 0;
2091: while (start < slen) {
2092: count = start+1;
2093: last = local_inidx[start];
2094: while (count < slen && last == local_inidx[count]) count++;
2095: if (count > start + 1) { /* found 2 or more same local_inidx[] in a row */
2096: /* sort on to index */
2097: PetscSortInt(count-start,local_inidy+start);
2098: }
2099: /* remove duplicates; not the most efficient way, but probably good enough */
2100: i = start;
2101: while (i < count-1) {
2102: if (local_inidy[i] != local_inidy[i+1]) {
2103: i++;
2104: } else { /* found a duplicate */
2105: duplicate = PETSC_TRUE;
2106: for (j=i; j<slen-1; j++) {
2107: local_inidx[j] = local_inidx[j+1];
2108: local_inidy[j] = local_inidy[j+1];
2109: }
2110: slen--;
2111: count--;
2112: }
2113: }
2114: start = count;
2115: }
2116: #endif
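/* The sort-and-uniquify pass above is compiled only when do_it_slow is defined, so in the
   default build duplicate remains PETSC_FALSE and the message below is never printed. */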
2117: if (duplicate) {
2118: PetscInfo(0,"Duplicate from to indices passed in VecScatterCreate(), they are ignored\n");
2119: }
2120: VecScatterCreate_StoP(slen,local_inidx,slen,local_inidy,xin,yin,1,ctx);
2121: PetscFree2(local_inidx,local_inidy);
2122: return(0);
2123: }