Line data Source code
1 : /*! \file
2 : Copyright (c) 2003, The Regents of the University of California, through
3 : Lawrence Berkeley National Laboratory (subject to receipt of any required
4 : approvals from U.S. Dept. of Energy)
5 :
6 : All rights reserved.
7 :
8 : The source code is distributed under BSD license, see the file License.txt
9 : at the top-level directory.
10 : */
11 :
12 : /*! @file dpanel_bmod.c
13 : * \brief Performs numeric block updates
14 : *
15 : * <pre>
16 : * -- SuperLU routine (version 3.0) --
17 : * Univ. of California Berkeley, Xerox Palo Alto Research Center,
18 : * and Lawrence Berkeley National Lab.
19 : * October 15, 2003
20 : *
21 : * Copyright (c) 1994 by Xerox Corporation. All rights reserved.
22 : *
23 : * THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
24 : * EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK.
25 : *
26 : * Permission is hereby granted to use or copy this program for any
27 : * purpose, provided the above notices are retained on all copies.
28 : * Permission to modify the code and to distribute modified code is
29 : * granted, provided the above notices are retained, and a notice that
30 : * the code was modified is included with the above copyright notice.
31 : * </pre>
32 : */
33 : /*
34 :
35 : */
36 :
37 : #include <stdio.h>
38 : #include <stdlib.h>
39 : #include "slu_ddefs.h"
40 :
41 : /*! \brief
42 : *
43 : * <pre>
44 : * Purpose
45 : * =======
46 : *
47 : * Performs numeric block updates (sup-panel) in topological order.
48 : * It features: col-col, 2cols-col, 3cols-col, and sup-col updates.
49 : * Special processing on the supernodal portion of L\U[*,j]
50 : *
51 : * Before entering this routine, the original nonzeros in the panel
52 : * were already copied into the spa[m,w].
53 : *
54 : * Updated/Output parameters-
55 : * dense[0:m-1,w]: L[*,j:j+w-1] and U[*,j:j+w-1] are returned
56 : * collectively in the m-by-w vector dense[*].
57 : * </pre>
58 : */
59 :
60 : void
61 0 : dpanel_bmod (
62 : const int m, /* in - number of rows in the matrix */
63 : const int w, /* in */
64 : const int jcol, /* in */
65 : const int nseg, /* in */
66 : double *dense, /* out, of size n by w */
67 : double *tempv, /* working array */
68 : int *segrep, /* in */
69 : int *repfnz, /* in, of size n by w */
70 : GlobalLU_t *Glu, /* modified */
71 : SuperLUStat_t *stat /* output */
72 : )
73 : {
74 :
75 :
76 : #ifdef USE_VENDOR_BLAS
77 : #ifdef _CRAY
78 : _fcd ftcs1 = _cptofcd("L", strlen("L")),
79 : ftcs2 = _cptofcd("N", strlen("N")),
80 : ftcs3 = _cptofcd("U", strlen("U"));
81 : #endif
82 : int incx = 1, incy = 1;
83 : double alpha, beta;
84 : #endif
85 :
86 : register int k, ksub;
87 : int fsupc, nsupc, nsupr, nrow;
88 : int krep, krep_ind;
89 : double ukj, ukj1, ukj2;
90 : int_t luptr, luptr1, luptr2;
91 : int segsze;
92 : int block_nrow; /* no of rows in a block row */
93 : int_t lptr; /* Points to the row subscripts of a supernode */
94 : int kfnz, irow, no_zeros;
95 : register int isub, isub1, i;
96 : register int jj; /* Index through each column in the panel */
97 : int *xsup, *supno;
98 : int_t *lsub, *xlsub;
99 : double *lusup;
100 : int_t *xlusup;
101 : int *repfnz_col; /* repfnz[] for a column in the panel */
102 : double *dense_col; /* dense[] for a column in the panel */
103 : double *tempv1; /* Used in 1-D update */
104 : double *TriTmp, *MatvecTmp; /* used in 2-D update */
105 : double zero = 0.0;
106 : double one = 1.0;
107 : register int ldaTmp;
108 : register int r_ind, r_hi;
109 : int maxsuper, rowblk, colblk;
110 0 : flops_t *ops = stat->ops;
111 :
112 0 : xsup = Glu->xsup;
113 0 : supno = Glu->supno;
114 0 : lsub = Glu->lsub;
115 0 : xlsub = Glu->xlsub;
116 0 : lusup = (double *) Glu->lusup;
117 0 : xlusup = Glu->xlusup;
118 :
119 0 : maxsuper = SUPERLU_MAX( sp_ienv(3), sp_ienv(7) );
120 0 : rowblk = sp_ienv(4);
121 0 : colblk = sp_ienv(5);
122 0 : ldaTmp = maxsuper + rowblk;
123 :
124 : /*
125 : * For each nonz supernode segment of U[*,j] in topological order
126 : */
127 0 : k = nseg - 1;
128 0 : for (ksub = 0; ksub < nseg; ksub++) { /* for each updating supernode */
129 :
130 : /* krep = representative of current k-th supernode
131 : * fsupc = first supernodal column
132 : * nsupc = no of columns in a supernode
133 : * nsupr = no of rows in a supernode
134 : */
135 0 : krep = segrep[k--];
136 0 : fsupc = xsup[supno[krep]];
137 0 : nsupc = krep - fsupc + 1;
138 0 : nsupr = xlsub[fsupc+1] - xlsub[fsupc];
139 0 : nrow = nsupr - nsupc;
140 : lptr = xlsub[fsupc];
141 0 : krep_ind = lptr + nsupc - 1;
142 :
143 : repfnz_col = repfnz;
144 : dense_col = dense;
145 :
146 0 : if ( nsupc >= colblk && nrow > rowblk ) { /* 2-D block update */
147 :
148 : TriTmp = tempv;
149 :
150 : /* Sequence through each column in panel -- triangular solves */
151 0 : for (jj = jcol; jj < jcol + w; jj++,
152 0 : repfnz_col += m, dense_col += m, TriTmp += ldaTmp ) {
153 :
154 0 : kfnz = repfnz_col[krep];
155 0 : if ( kfnz == SLU_EMPTY ) continue; /* Skip any zero segment */
156 :
157 0 : segsze = krep - kfnz + 1;
158 0 : luptr = xlusup[fsupc];
159 :
160 0 : ops[TRSV] += segsze * (segsze - 1);
161 0 : ops[GEMV] += 2 * nrow * segsze;
162 :
163 : /* Case 1: Update U-segment of size 1 -- col-col update */
164 0 : if ( segsze == 1 ) {
165 0 : ukj = dense_col[lsub[krep_ind]];
166 0 : luptr += nsupr*(nsupc-1) + nsupc;
167 :
168 0 : for (i = lptr + nsupc; i < xlsub[fsupc+1]; i++) {
169 0 : irow = lsub[i];
170 0 : dense_col[irow] -= ukj * lusup[luptr];
171 0 : ++luptr;
172 : }
173 :
174 0 : } else if ( segsze <= 3 ) {
175 0 : ukj = dense_col[lsub[krep_ind]];
176 0 : ukj1 = dense_col[lsub[krep_ind - 1]];
177 0 : luptr += nsupr*(nsupc-1) + nsupc-1;
178 0 : luptr1 = luptr - nsupr;
179 :
180 0 : if ( segsze == 2 ) {
181 0 : ukj -= ukj1 * lusup[luptr1];
182 0 : dense_col[lsub[krep_ind]] = ukj;
183 0 : for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) {
184 0 : irow = lsub[i];
185 0 : luptr++; luptr1++;
186 0 : dense_col[irow] -= (ukj*lusup[luptr]
187 0 : + ukj1*lusup[luptr1]);
188 : }
189 : } else {
190 0 : ukj2 = dense_col[lsub[krep_ind - 2]];
191 0 : luptr2 = luptr1 - nsupr;
192 0 : ukj1 -= ukj2 * lusup[luptr2-1];
193 0 : ukj = ukj - ukj1*lusup[luptr1] - ukj2*lusup[luptr2];
194 0 : dense_col[lsub[krep_ind]] = ukj;
195 0 : dense_col[lsub[krep_ind-1]] = ukj1;
196 0 : for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) {
197 0 : irow = lsub[i];
198 0 : luptr++; luptr1++; luptr2++;
199 0 : dense_col[irow] -= ( ukj*lusup[luptr]
200 0 : + ukj1*lusup[luptr1] + ukj2*lusup[luptr2] );
201 : }
202 : }
203 :
204 : } else { /* segsze >= 4 */
205 :
206 : /* Copy U[*,j] segment from dense[*] to TriTmp[*], which
207 : holds the result of triangular solves. */
208 0 : no_zeros = kfnz - fsupc;
209 0 : isub = lptr + no_zeros;
210 0 : for (i = 0; i < segsze; ++i) {
211 0 : irow = lsub[isub];
212 0 : TriTmp[i] = dense_col[irow]; /* Gather */
213 0 : ++isub;
214 : }
215 :
216 : /* start effective triangle */
217 0 : luptr += nsupr * no_zeros + no_zeros;
218 :
219 : #ifdef USE_VENDOR_BLAS
220 : #ifdef _CRAY
221 : STRSV( ftcs1, ftcs2, ftcs3, &segsze, &lusup[luptr],
222 : &nsupr, TriTmp, &incx );
223 : #else
224 : dtrsv_( "L", "N", "U", &segsze, &lusup[luptr],
225 : &nsupr, TriTmp, &incx );
226 : #endif
227 : #else
228 0 : dlsolve ( nsupr, segsze, &lusup[luptr], TriTmp );
229 : #endif
230 :
231 :
232 : } /* else ... */
233 :
234 : } /* for jj ... end tri-solves */
235 :
236 : /* Block row updates; push all the way into dense[*] block */
237 0 : for ( r_ind = 0; r_ind < nrow; r_ind += rowblk ) {
238 :
239 0 : r_hi = SUPERLU_MIN(nrow, r_ind + rowblk);
240 0 : block_nrow = SUPERLU_MIN(rowblk, r_hi - r_ind);
241 0 : luptr = xlusup[fsupc] + nsupc + r_ind;
242 0 : isub1 = lptr + nsupc + r_ind;
243 :
244 : repfnz_col = repfnz;
245 : TriTmp = tempv;
246 : dense_col = dense;
247 :
248 : /* Sequence through each column in panel -- matrix-vector */
249 0 : for (jj = jcol; jj < jcol + w; jj++,
250 0 : repfnz_col += m, dense_col += m, TriTmp += ldaTmp) {
251 :
252 0 : kfnz = repfnz_col[krep];
253 0 : if ( kfnz == SLU_EMPTY ) continue; /* Skip any zero segment */
254 :
255 0 : segsze = krep - kfnz + 1;
256 0 : if ( segsze <= 3 ) continue; /* skip unrolled cases */
257 :
258 : /* Perform a block update, and scatter the result of
259 : matrix-vector to dense[]. */
260 0 : no_zeros = kfnz - fsupc;
261 0 : luptr1 = luptr + nsupr * no_zeros;
262 0 : MatvecTmp = &TriTmp[maxsuper];
263 :
264 : #ifdef USE_VENDOR_BLAS
265 : alpha = one;
266 : beta = zero;
267 : #ifdef _CRAY
268 : SGEMV(ftcs2, &block_nrow, &segsze, &alpha, &lusup[luptr1],
269 : &nsupr, TriTmp, &incx, &beta, MatvecTmp, &incy);
270 : #else
271 : dgemv_("N", &block_nrow, &segsze, &alpha, &lusup[luptr1],
272 : &nsupr, TriTmp, &incx, &beta, MatvecTmp, &incy);
273 : #endif
274 : #else
275 0 : dmatvec(nsupr, block_nrow, segsze, &lusup[luptr1],
276 : TriTmp, MatvecTmp);
277 : #endif
278 :
279 : /* Scatter MatvecTmp[*] into SPA dense[*] temporarily
280 : * such that MatvecTmp[*] can be re-used for the
281 : * the next blok row update. dense[] will be copied into
282 : * global store after the whole panel has been finished.
283 : */
284 : isub = isub1;
285 0 : for (i = 0; i < block_nrow; i++) {
286 0 : irow = lsub[isub];
287 0 : dense_col[irow] -= MatvecTmp[i];
288 0 : MatvecTmp[i] = zero;
289 0 : ++isub;
290 : }
291 :
292 : } /* for jj ... */
293 :
294 : } /* for each block row ... */
295 :
296 : /* Scatter the triangular solves into SPA dense[*] */
297 : repfnz_col = repfnz;
298 : TriTmp = tempv;
299 : dense_col = dense;
300 :
301 0 : for (jj = jcol; jj < jcol + w; jj++,
302 0 : repfnz_col += m, dense_col += m, TriTmp += ldaTmp) {
303 0 : kfnz = repfnz_col[krep];
304 0 : if ( kfnz == SLU_EMPTY ) continue; /* Skip any zero segment */
305 :
306 0 : segsze = krep - kfnz + 1;
307 0 : if ( segsze <= 3 ) continue; /* skip unrolled cases */
308 :
309 0 : no_zeros = kfnz - fsupc;
310 0 : isub = lptr + no_zeros;
311 0 : for (i = 0; i < segsze; i++) {
312 0 : irow = lsub[isub];
313 0 : dense_col[irow] = TriTmp[i];
314 0 : TriTmp[i] = zero;
315 0 : ++isub;
316 : }
317 :
318 : } /* for jj ... */
319 :
320 : } else { /* 1-D block modification */
321 :
322 :
323 : /* Sequence through each column in the panel */
324 0 : for (jj = jcol; jj < jcol + w; jj++,
325 0 : repfnz_col += m, dense_col += m) {
326 :
327 0 : kfnz = repfnz_col[krep];
328 0 : if ( kfnz == SLU_EMPTY ) continue; /* Skip any zero segment */
329 :
330 0 : segsze = krep - kfnz + 1;
331 0 : luptr = xlusup[fsupc];
332 :
333 0 : ops[TRSV] += segsze * (segsze - 1);
334 0 : ops[GEMV] += 2 * nrow * segsze;
335 :
336 : /* Case 1: Update U-segment of size 1 -- col-col update */
337 0 : if ( segsze == 1 ) {
338 0 : ukj = dense_col[lsub[krep_ind]];
339 0 : luptr += nsupr*(nsupc-1) + nsupc;
340 :
341 0 : for (i = lptr + nsupc; i < xlsub[fsupc+1]; i++) {
342 0 : irow = lsub[i];
343 0 : dense_col[irow] -= ukj * lusup[luptr];
344 0 : ++luptr;
345 : }
346 :
347 0 : } else if ( segsze <= 3 ) {
348 0 : ukj = dense_col[lsub[krep_ind]];
349 0 : luptr += nsupr*(nsupc-1) + nsupc-1;
350 0 : ukj1 = dense_col[lsub[krep_ind - 1]];
351 0 : luptr1 = luptr - nsupr;
352 :
353 0 : if ( segsze == 2 ) {
354 0 : ukj -= ukj1 * lusup[luptr1];
355 0 : dense_col[lsub[krep_ind]] = ukj;
356 0 : for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) {
357 0 : irow = lsub[i];
358 0 : ++luptr; ++luptr1;
359 0 : dense_col[irow] -= (ukj*lusup[luptr]
360 0 : + ukj1*lusup[luptr1]);
361 : }
362 : } else {
363 0 : ukj2 = dense_col[lsub[krep_ind - 2]];
364 0 : luptr2 = luptr1 - nsupr;
365 0 : ukj1 -= ukj2 * lusup[luptr2-1];
366 0 : ukj = ukj - ukj1*lusup[luptr1] - ukj2*lusup[luptr2];
367 0 : dense_col[lsub[krep_ind]] = ukj;
368 0 : dense_col[lsub[krep_ind-1]] = ukj1;
369 0 : for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) {
370 0 : irow = lsub[i];
371 0 : ++luptr; ++luptr1; ++luptr2;
372 0 : dense_col[irow] -= ( ukj*lusup[luptr]
373 0 : + ukj1*lusup[luptr1] + ukj2*lusup[luptr2] );
374 : }
375 : }
376 :
377 : } else { /* segsze >= 4 */
378 : /*
379 : * Perform a triangular solve and block update,
380 : * then scatter the result of sup-col update to dense[].
381 : */
382 0 : no_zeros = kfnz - fsupc;
383 :
384 : /* Copy U[*,j] segment from dense[*] to tempv[*]:
385 : * The result of triangular solve is in tempv[*];
386 : * The result of matrix vector update is in dense_col[*]
387 : */
388 0 : isub = lptr + no_zeros;
389 0 : for (i = 0; i < segsze; ++i) {
390 0 : irow = lsub[isub];
391 0 : tempv[i] = dense_col[irow]; /* Gather */
392 0 : ++isub;
393 : }
394 :
395 : /* start effective triangle */
396 0 : luptr += nsupr * no_zeros + no_zeros;
397 :
398 : #ifdef USE_VENDOR_BLAS
399 : #ifdef _CRAY
400 : STRSV( ftcs1, ftcs2, ftcs3, &segsze, &lusup[luptr],
401 : &nsupr, tempv, &incx );
402 : #else
403 : dtrsv_( "L", "N", "U", &segsze, &lusup[luptr],
404 : &nsupr, tempv, &incx );
405 : #endif
406 :
407 : luptr += segsze; /* Dense matrix-vector */
408 : tempv1 = &tempv[segsze];
409 : alpha = one;
410 : beta = zero;
411 : #ifdef _CRAY
412 : SGEMV( ftcs2, &nrow, &segsze, &alpha, &lusup[luptr],
413 : &nsupr, tempv, &incx, &beta, tempv1, &incy );
414 : #else
415 : dgemv_( "N", &nrow, &segsze, &alpha, &lusup[luptr],
416 : &nsupr, tempv, &incx, &beta, tempv1, &incy );
417 : #endif
418 : #else
419 0 : dlsolve ( nsupr, segsze, &lusup[luptr], tempv );
420 :
421 0 : luptr += segsze; /* Dense matrix-vector */
422 0 : tempv1 = &tempv[segsze];
423 0 : dmatvec (nsupr, nrow, segsze, &lusup[luptr], tempv, tempv1);
424 : #endif
425 :
426 : /* Scatter tempv[*] into SPA dense[*] temporarily, such
427 : * that tempv[*] can be used for the triangular solve of
428 : * the next column of the panel. They will be copied into
429 : * ucol[*] after the whole panel has been finished.
430 : */
431 : isub = lptr + no_zeros;
432 0 : for (i = 0; i < segsze; i++) {
433 0 : irow = lsub[isub];
434 0 : dense_col[irow] = tempv[i];
435 0 : tempv[i] = zero;
436 0 : isub++;
437 : }
438 :
439 : /* Scatter the update from tempv1[*] into SPA dense[*] */
440 : /* Start dense rectangular L */
441 0 : for (i = 0; i < nrow; i++) {
442 0 : irow = lsub[isub];
443 0 : dense_col[irow] -= tempv1[i];
444 0 : tempv1[i] = zero;
445 0 : ++isub;
446 : }
447 :
448 : } /* else segsze>=4 ... */
449 :
450 : } /* for each column in the panel... */
451 :
452 : } /* else 1-D update ... */
453 :
454 : } /* for each updating supernode ... */
455 :
456 0 : }
457 :
|