Actual source code: baijsolv.c
1: #include <../src/mat/impls/baij/seq/baij.h>
2: #include <petsc/private/kernels/blockinvert.h>
4: PetscErrorCode MatSolve_SeqBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
5: {
6: Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data;
7: IS iscol=a->col,isrow=a->row;
8: const PetscInt *r,*c,*rout,*cout;
9: const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*vi;
10: PetscInt i,nz;
11: const PetscInt bs =A->rmap->bs,bs2=a->bs2;
12: const MatScalar *aa=a->a,*v;
13: PetscScalar *x,*s,*t,*ls;
14: const PetscScalar *b;
16: VecGetArrayRead(bb,&b);
17: VecGetArray(xx,&x);
18: t = a->solve_work;
20: ISGetIndices(isrow,&rout); r = rout;
21: ISGetIndices(iscol,&cout)); c = cout + (n-1;
23: /* forward solve the lower triangular */
24: PetscArraycpy(t,b+bs*(*r++),bs);
25: for (i=1; i<n; i++) {
26: v = aa + bs2*ai[i];
27: vi = aj + ai[i];
28: nz = a->diag[i] - ai[i];
29: s = t + bs*i;
30: PetscArraycpy(s,b+bs*(*r++),bs);
31: while (nz--) {
32: PetscKernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++));
33: v += bs2;
34: }
35: }
36: /* backward solve the upper triangular */
37: ls = a->solve_work + A->cmap->n;
38: for (i=n-1; i>=0; i--) {
39: v = aa + bs2*(a->diag[i] + 1);
40: vi = aj + a->diag[i] + 1;
41: nz = ai[i+1] - a->diag[i] - 1;
42: PetscArraycpy(ls,t+i*bs,bs);
43: while (nz--) {
44: PetscKernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++));
45: v += bs2;
46: }
47: PetscKernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
48: PetscArraycpy(x + bs*(*c--),t+i*bs,bs);
49: }
51: ISRestoreIndices(isrow,&rout);
52: ISRestoreIndices(iscol,&cout);
53: VecRestoreArrayRead(bb,&b);
54: VecRestoreArray(xx,&x);
55: PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);
56: return 0;
57: }
59: PetscErrorCode MatSolve_SeqBAIJ_7_inplace(Mat A,Vec bb,Vec xx)
60: {
61: Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data;
62: IS iscol=a->col,isrow=a->row;
63: const PetscInt *r,*c,*ai=a->i,*aj=a->j;
64: const PetscInt *rout,*cout,*diag = a->diag,*vi,n=a->mbs;
65: PetscInt i,nz,idx,idt,idc;
66: const MatScalar *aa=a->a,*v;
67: PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t;
68: const PetscScalar *b;
70: VecGetArrayRead(bb,&b);
71: VecGetArray(xx,&x);
72: t = a->solve_work;
74: ISGetIndices(isrow,&rout); r = rout;
75: ISGetIndices(iscol,&cout)); c = cout + (n-1;
77: /* forward solve the lower triangular */
78: idx = 7*(*r++);
79: t[0] = b[idx]; t[1] = b[1+idx];
80: t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
81: t[5] = b[5+idx]; t[6] = b[6+idx];
83: for (i=1; i<n; i++) {
84: v = aa + 49*ai[i];
85: vi = aj + ai[i];
86: nz = diag[i] - ai[i];
87: idx = 7*(*r++);
88: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
89: s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
90: while (nz--) {
91: idx = 7*(*vi++);
92: x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx];
93: x4 = t[3+idx];x5 = t[4+idx];
94: x6 = t[5+idx];x7 = t[6+idx];
95: s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
96: s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
97: s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
98: s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
99: s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
100: s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
101: s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
102: v += 49;
103: }
104: idx = 7*i;
105: t[idx] = s1;t[1+idx] = s2;
106: t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
107: t[5+idx] = s6;t[6+idx] = s7;
108: }
109: /* backward solve the upper triangular */
110: for (i=n-1; i>=0; i--) {
111: v = aa + 49*diag[i] + 49;
112: vi = aj + diag[i] + 1;
113: nz = ai[i+1] - diag[i] - 1;
114: idt = 7*i;
115: s1 = t[idt]; s2 = t[1+idt];
116: s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
117: s6 = t[5+idt];s7 = t[6+idt];
118: while (nz--) {
119: idx = 7*(*vi++);
120: x1 = t[idx]; x2 = t[1+idx];
121: x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
122: x6 = t[5+idx]; x7 = t[6+idx];
123: s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
124: s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
125: s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
126: s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
127: s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
128: s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
129: s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
130: v += 49;
131: }
132: idc = 7*(*c--);
133: v = aa + 49*diag[i];
134: x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+
135: v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
136: x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
137: v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
138: x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
139: v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
140: x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
141: v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
142: x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
143: v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
144: x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
145: v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
146: x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
147: v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
148: }
150: ISRestoreIndices(isrow,&rout);
151: ISRestoreIndices(iscol,&cout);
152: VecRestoreArrayRead(bb,&b);
153: VecRestoreArray(xx,&x);
154: PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);
155: return 0;
156: }
158: PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
159: {
160: Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data;
161: IS iscol=a->col,isrow=a->row;
162: const PetscInt *r,*c,*ai=a->i,*aj=a->j,*adiag=a->diag;
163: const PetscInt n=a->mbs,*rout,*cout,*vi;
164: PetscInt i,nz,idx,idt,idc,m;
165: const MatScalar *aa=a->a,*v;
166: PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t;
167: const PetscScalar *b;
169: VecGetArrayRead(bb,&b);
170: VecGetArray(xx,&x);
171: t = a->solve_work;
173: ISGetIndices(isrow,&rout); r = rout;
174: ISGetIndices(iscol,&cout); c = cout;
176: /* forward solve the lower triangular */
177: idx = 7*r[0];
178: t[0] = b[idx]; t[1] = b[1+idx];
179: t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
180: t[5] = b[5+idx]; t[6] = b[6+idx];
182: for (i=1; i<n; i++) {
183: v = aa + 49*ai[i];
184: vi = aj + ai[i];
185: nz = ai[i+1] - ai[i];
186: idx = 7*r[i];
187: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
188: s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
189: for (m=0; m<nz; m++) {
190: idx = 7*vi[m];
191: x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx];
192: x4 = t[3+idx];x5 = t[4+idx];
193: x6 = t[5+idx];x7 = t[6+idx];
194: s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
195: s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
196: s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
197: s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
198: s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
199: s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
200: s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
201: v += 49;
202: }
203: idx = 7*i;
204: t[idx] = s1;t[1+idx] = s2;
205: t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
206: t[5+idx] = s6;t[6+idx] = s7;
207: }
208: /* backward solve the upper triangular */
209: for (i=n-1; i>=0; i--) {
210: v = aa + 49*(adiag[i+1]+1);
211: vi = aj + adiag[i+1]+1;
212: nz = adiag[i] - adiag[i+1] - 1;
213: idt = 7*i;
214: s1 = t[idt]; s2 = t[1+idt];
215: s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
216: s6 = t[5+idt];s7 = t[6+idt];
217: for (m=0; m<nz; m++) {
218: idx = 7*vi[m];
219: x1 = t[idx]; x2 = t[1+idx];
220: x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
221: x6 = t[5+idx]; x7 = t[6+idx];
222: s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
223: s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
224: s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
225: s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
226: s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
227: s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
228: s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
229: v += 49;
230: }
231: idc = 7*c[i];
232: x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+
233: v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
234: x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
235: v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
236: x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
237: v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
238: x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
239: v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
240: x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
241: v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
242: x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
243: v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
244: x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
245: v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
246: }
248: ISRestoreIndices(isrow,&rout);
249: ISRestoreIndices(iscol,&cout);
250: VecRestoreArrayRead(bb,&b);
251: VecRestoreArray(xx,&x);
252: PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);
253: return 0;
254: }
256: PetscErrorCode MatSolve_SeqBAIJ_6_inplace(Mat A,Vec bb,Vec xx)
257: {
258: Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data;
259: IS iscol=a->col,isrow=a->row;
260: const PetscInt *r,*c,*rout,*cout;
261: const PetscInt *diag = a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
262: PetscInt i,nz,idx,idt,idc;
263: const MatScalar *aa=a->a,*v;
264: PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
265: const PetscScalar *b;
267: VecGetArrayRead(bb,&b);
268: VecGetArray(xx,&x);
269: t = a->solve_work;
271: ISGetIndices(isrow,&rout); r = rout;
272: ISGetIndices(iscol,&cout)); c = cout + (n-1;
274: /* forward solve the lower triangular */
275: idx = 6*(*r++);
276: t[0] = b[idx]; t[1] = b[1+idx];
277: t[2] = b[2+idx]; t[3] = b[3+idx];
278: t[4] = b[4+idx]; t[5] = b[5+idx];
279: for (i=1; i<n; i++) {
280: v = aa + 36*ai[i];
281: vi = aj + ai[i];
282: nz = diag[i] - ai[i];
283: idx = 6*(*r++);
284: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
285: s5 = b[4+idx]; s6 = b[5+idx];
286: while (nz--) {
287: idx = 6*(*vi++);
288: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
289: x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
290: s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
291: s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
292: s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
293: s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
294: s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
295: s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
296: v += 36;
297: }
298: idx = 6*i;
299: t[idx] = s1;t[1+idx] = s2;
300: t[2+idx] = s3;t[3+idx] = s4;
301: t[4+idx] = s5;t[5+idx] = s6;
302: }
303: /* backward solve the upper triangular */
304: for (i=n-1; i>=0; i--) {
305: v = aa + 36*diag[i] + 36;
306: vi = aj + diag[i] + 1;
307: nz = ai[i+1] - diag[i] - 1;
308: idt = 6*i;
309: s1 = t[idt]; s2 = t[1+idt];
310: s3 = t[2+idt];s4 = t[3+idt];
311: s5 = t[4+idt];s6 = t[5+idt];
312: while (nz--) {
313: idx = 6*(*vi++);
314: x1 = t[idx]; x2 = t[1+idx];
315: x3 = t[2+idx]; x4 = t[3+idx];
316: x5 = t[4+idx]; x6 = t[5+idx];
317: s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
318: s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
319: s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
320: s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
321: s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
322: s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
323: v += 36;
324: }
325: idc = 6*(*c--);
326: v = aa + 36*diag[i];
327: x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+
328: v[18]*s4+v[24]*s5+v[30]*s6;
329: x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
330: v[19]*s4+v[25]*s5+v[31]*s6;
331: x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
332: v[20]*s4+v[26]*s5+v[32]*s6;
333: x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
334: v[21]*s4+v[27]*s5+v[33]*s6;
335: x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
336: v[22]*s4+v[28]*s5+v[34]*s6;
337: x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
338: v[23]*s4+v[29]*s5+v[35]*s6;
339: }
341: ISRestoreIndices(isrow,&rout);
342: ISRestoreIndices(iscol,&cout);
343: VecRestoreArrayRead(bb,&b);
344: VecRestoreArray(xx,&x);
345: PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);
346: return 0;
347: }
349: PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
350: {
351: Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data;
352: IS iscol=a->col,isrow=a->row;
353: const PetscInt *r,*c,*rout,*cout;
354: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
355: PetscInt i,nz,idx,idt,idc,m;
356: const MatScalar *aa=a->a,*v;
357: PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
358: const PetscScalar *b;
360: VecGetArrayRead(bb,&b);
361: VecGetArray(xx,&x);
362: t = a->solve_work;
364: ISGetIndices(isrow,&rout); r = rout;
365: ISGetIndices(iscol,&cout); c = cout;
367: /* forward solve the lower triangular */
368: idx = 6*r[0];
369: t[0] = b[idx]; t[1] = b[1+idx];
370: t[2] = b[2+idx]; t[3] = b[3+idx];
371: t[4] = b[4+idx]; t[5] = b[5+idx];
372: for (i=1; i<n; i++) {
373: v = aa + 36*ai[i];
374: vi = aj + ai[i];
375: nz = ai[i+1] - ai[i];
376: idx = 6*r[i];
377: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
378: s5 = b[4+idx]; s6 = b[5+idx];
379: for (m=0; m<nz; m++) {
380: idx = 6*vi[m];
381: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
382: x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
383: s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
384: s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
385: s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
386: s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
387: s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
388: s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
389: v += 36;
390: }
391: idx = 6*i;
392: t[idx] = s1;t[1+idx] = s2;
393: t[2+idx] = s3;t[3+idx] = s4;
394: t[4+idx] = s5;t[5+idx] = s6;
395: }
396: /* backward solve the upper triangular */
397: for (i=n-1; i>=0; i--) {
398: v = aa + 36*(adiag[i+1]+1);
399: vi = aj + adiag[i+1]+1;
400: nz = adiag[i] - adiag[i+1] - 1;
401: idt = 6*i;
402: s1 = t[idt]; s2 = t[1+idt];
403: s3 = t[2+idt];s4 = t[3+idt];
404: s5 = t[4+idt];s6 = t[5+idt];
405: for (m=0; m<nz; m++) {
406: idx = 6*vi[m];
407: x1 = t[idx]; x2 = t[1+idx];
408: x3 = t[2+idx]; x4 = t[3+idx];
409: x5 = t[4+idx]; x6 = t[5+idx];
410: s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
411: s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
412: s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
413: s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
414: s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
415: s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
416: v += 36;
417: }
418: idc = 6*c[i];
419: x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+
420: v[18]*s4+v[24]*s5+v[30]*s6;
421: x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
422: v[19]*s4+v[25]*s5+v[31]*s6;
423: x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
424: v[20]*s4+v[26]*s5+v[32]*s6;
425: x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
426: v[21]*s4+v[27]*s5+v[33]*s6;
427: x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
428: v[22]*s4+v[28]*s5+v[34]*s6;
429: x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
430: v[23]*s4+v[29]*s5+v[35]*s6;
431: }
433: ISRestoreIndices(isrow,&rout);
434: ISRestoreIndices(iscol,&cout);
435: VecRestoreArrayRead(bb,&b);
436: VecRestoreArray(xx,&x);
437: PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);
438: return 0;
439: }
441: PetscErrorCode MatSolve_SeqBAIJ_5_inplace(Mat A,Vec bb,Vec xx)
442: {
443: Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data;
444: IS iscol=a->col,isrow=a->row;
445: const PetscInt *r,*c,*rout,*cout,*diag = a->diag;
446: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j;
447: PetscInt i,nz,idx,idt,idc;
448: const MatScalar *aa=a->a,*v;
449: PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
450: const PetscScalar *b;
452: VecGetArrayRead(bb,&b);
453: VecGetArray(xx,&x);
454: t = a->solve_work;
456: ISGetIndices(isrow,&rout); r = rout;
457: ISGetIndices(iscol,&cout)); c = cout + (n-1;
459: /* forward solve the lower triangular */
460: idx = 5*(*r++);
461: t[0] = b[idx]; t[1] = b[1+idx];
462: t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
463: for (i=1; i<n; i++) {
464: v = aa + 25*ai[i];
465: vi = aj + ai[i];
466: nz = diag[i] - ai[i];
467: idx = 5*(*r++);
468: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
469: s5 = b[4+idx];
470: while (nz--) {
471: idx = 5*(*vi++);
472: x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx];
473: x4 = t[3+idx];x5 = t[4+idx];
474: s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
475: s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
476: s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
477: s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
478: s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
479: v += 25;
480: }
481: idx = 5*i;
482: t[idx] = s1;t[1+idx] = s2;
483: t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
484: }
485: /* backward solve the upper triangular */
486: for (i=n-1; i>=0; i--) {
487: v = aa + 25*diag[i] + 25;
488: vi = aj + diag[i] + 1;
489: nz = ai[i+1] - diag[i] - 1;
490: idt = 5*i;
491: s1 = t[idt]; s2 = t[1+idt];
492: s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
493: while (nz--) {
494: idx = 5*(*vi++);
495: x1 = t[idx]; x2 = t[1+idx];
496: x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
497: s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
498: s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
499: s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
500: s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
501: s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
502: v += 25;
503: }
504: idc = 5*(*c--);
505: v = aa + 25*diag[i];
506: x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+
507: v[15]*s4+v[20]*s5;
508: x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
509: v[16]*s4+v[21]*s5;
510: x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
511: v[17]*s4+v[22]*s5;
512: x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
513: v[18]*s4+v[23]*s5;
514: x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
515: v[19]*s4+v[24]*s5;
516: }
518: ISRestoreIndices(isrow,&rout);
519: ISRestoreIndices(iscol,&cout);
520: VecRestoreArrayRead(bb,&b);
521: VecRestoreArray(xx,&x);
522: PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);
523: return 0;
524: }
526: PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
527: {
528: Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data;
529: IS iscol=a->col,isrow=a->row;
530: const PetscInt *r,*c,*rout,*cout;
531: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
532: PetscInt i,nz,idx,idt,idc,m;
533: const MatScalar *aa=a->a,*v;
534: PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
535: const PetscScalar *b;
537: VecGetArrayRead(bb,&b);
538: VecGetArray(xx,&x);
539: t = a->solve_work;
541: ISGetIndices(isrow,&rout); r = rout;
542: ISGetIndices(iscol,&cout); c = cout;
544: /* forward solve the lower triangular */
545: idx = 5*r[0];
546: t[0] = b[idx]; t[1] = b[1+idx];
547: t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
548: for (i=1; i<n; i++) {
549: v = aa + 25*ai[i];
550: vi = aj + ai[i];
551: nz = ai[i+1] - ai[i];
552: idx = 5*r[i];
553: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
554: s5 = b[4+idx];
555: for (m=0; m<nz; m++) {
556: idx = 5*vi[m];
557: x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx];
558: x4 = t[3+idx];x5 = t[4+idx];
559: s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
560: s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
561: s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
562: s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
563: s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
564: v += 25;
565: }
566: idx = 5*i;
567: t[idx] = s1;t[1+idx] = s2;
568: t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
569: }
570: /* backward solve the upper triangular */
571: for (i=n-1; i>=0; i--) {
572: v = aa + 25*(adiag[i+1]+1);
573: vi = aj + adiag[i+1]+1;
574: nz = adiag[i] - adiag[i+1] - 1;
575: idt = 5*i;
576: s1 = t[idt]; s2 = t[1+idt];
577: s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
578: for (m=0; m<nz; m++) {
579: idx = 5*vi[m];
580: x1 = t[idx]; x2 = t[1+idx];
581: x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
582: s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
583: s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
584: s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
585: s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
586: s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
587: v += 25;
588: }
589: idc = 5*c[i];
590: x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+
591: v[15]*s4+v[20]*s5;
592: x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
593: v[16]*s4+v[21]*s5;
594: x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
595: v[17]*s4+v[22]*s5;
596: x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
597: v[18]*s4+v[23]*s5;
598: x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
599: v[19]*s4+v[24]*s5;
600: }
602: ISRestoreIndices(isrow,&rout);
603: ISRestoreIndices(iscol,&cout);
604: VecRestoreArrayRead(bb,&b);
605: VecRestoreArray(xx,&x);
606: PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);
607: return 0;
608: }
610: PetscErrorCode MatSolve_SeqBAIJ_4_inplace(Mat A,Vec bb,Vec xx)
611: {
612: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
613: IS iscol=a->col,isrow=a->row;
614: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j;
615: PetscInt i,nz,idx,idt,idc;
616: const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
617: const MatScalar *aa=a->a,*v;
618: PetscScalar *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
619: const PetscScalar *b;
621: VecGetArrayRead(bb,&b);
622: VecGetArray(xx,&x);
623: t = a->solve_work;
625: ISGetIndices(isrow,&rout); r = rout;
626: ISGetIndices(iscol,&cout)); c = cout + (n-1;
628: /* forward solve the lower triangular */
629: idx = 4*(*r++);
630: t[0] = b[idx]; t[1] = b[1+idx];
631: t[2] = b[2+idx]; t[3] = b[3+idx];
632: for (i=1; i<n; i++) {
633: v = aa + 16*ai[i];
634: vi = aj + ai[i];
635: nz = diag[i] - ai[i];
636: idx = 4*(*r++);
637: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
638: while (nz--) {
639: idx = 4*(*vi++);
640: x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
641: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
642: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
643: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
644: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
645: v += 16;
646: }
647: idx = 4*i;
648: t[idx] = s1;t[1+idx] = s2;
649: t[2+idx] = s3;t[3+idx] = s4;
650: }
651: /* backward solve the upper triangular */
652: for (i=n-1; i>=0; i--) {
653: v = aa + 16*diag[i] + 16;
654: vi = aj + diag[i] + 1;
655: nz = ai[i+1] - diag[i] - 1;
656: idt = 4*i;
657: s1 = t[idt]; s2 = t[1+idt];
658: s3 = t[2+idt];s4 = t[3+idt];
659: while (nz--) {
660: idx = 4*(*vi++);
661: x1 = t[idx]; x2 = t[1+idx];
662: x3 = t[2+idx]; x4 = t[3+idx];
663: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
664: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
665: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
666: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
667: v += 16;
668: }
669: idc = 4*(*c--);
670: v = aa + 16*diag[i];
671: x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
672: x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
673: x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
674: x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
675: }
677: ISRestoreIndices(isrow,&rout);
678: ISRestoreIndices(iscol,&cout);
679: VecRestoreArrayRead(bb,&b);
680: VecRestoreArray(xx,&x);
681: PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);
682: return 0;
683: }
685: PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
686: {
687: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
688: IS iscol=a->col,isrow=a->row;
689: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
690: PetscInt i,nz,idx,idt,idc,m;
691: const PetscInt *r,*c,*rout,*cout;
692: const MatScalar *aa=a->a,*v;
693: PetscScalar *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
694: const PetscScalar *b;
696: VecGetArrayRead(bb,&b);
697: VecGetArray(xx,&x);
698: t = a->solve_work;
700: ISGetIndices(isrow,&rout); r = rout;
701: ISGetIndices(iscol,&cout); c = cout;
703: /* forward solve the lower triangular */
704: idx = 4*r[0];
705: t[0] = b[idx]; t[1] = b[1+idx];
706: t[2] = b[2+idx]; t[3] = b[3+idx];
707: for (i=1; i<n; i++) {
708: v = aa + 16*ai[i];
709: vi = aj + ai[i];
710: nz = ai[i+1] - ai[i];
711: idx = 4*r[i];
712: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
713: for (m=0; m<nz; m++) {
714: idx = 4*vi[m];
715: x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
716: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
717: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
718: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
719: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
720: v += 16;
721: }
722: idx = 4*i;
723: t[idx] = s1;t[1+idx] = s2;
724: t[2+idx] = s3;t[3+idx] = s4;
725: }
726: /* backward solve the upper triangular */
727: for (i=n-1; i>=0; i--) {
728: v = aa + 16*(adiag[i+1]+1);
729: vi = aj + adiag[i+1]+1;
730: nz = adiag[i] - adiag[i+1] - 1;
731: idt = 4*i;
732: s1 = t[idt]; s2 = t[1+idt];
733: s3 = t[2+idt];s4 = t[3+idt];
734: for (m=0; m<nz; m++) {
735: idx = 4*vi[m];
736: x1 = t[idx]; x2 = t[1+idx];
737: x3 = t[2+idx]; x4 = t[3+idx];
738: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
739: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
740: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
741: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
742: v += 16;
743: }
744: idc = 4*c[i];
745: x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
746: x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
747: x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
748: x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
749: }
751: ISRestoreIndices(isrow,&rout);
752: ISRestoreIndices(iscol,&cout);
753: VecRestoreArrayRead(bb,&b);
754: VecRestoreArray(xx,&x);
755: PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);
756: return 0;
757: }
759: PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx)
760: {
761: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
762: IS iscol=a->col,isrow=a->row;
763: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j;
764: PetscInt i,nz,idx,idt,idc;
765: const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
766: const MatScalar *aa=a->a,*v;
767: MatScalar s1,s2,s3,s4,x1,x2,x3,x4,*t;
768: PetscScalar *x;
769: const PetscScalar *b;
771: VecGetArrayRead(bb,&b);
772: VecGetArray(xx,&x);
773: t = (MatScalar*)a->solve_work;
775: ISGetIndices(isrow,&rout); r = rout;
776: ISGetIndices(iscol,&cout)); c = cout + (n-1;
778: /* forward solve the lower triangular */
779: idx = 4*(*r++);
780: t[0] = (MatScalar)b[idx];
781: t[1] = (MatScalar)b[1+idx];
782: t[2] = (MatScalar)b[2+idx];
783: t[3] = (MatScalar)b[3+idx];
784: for (i=1; i<n; i++) {
785: v = aa + 16*ai[i];
786: vi = aj + ai[i];
787: nz = diag[i] - ai[i];
788: idx = 4*(*r++);
789: s1 = (MatScalar)b[idx];
790: s2 = (MatScalar)b[1+idx];
791: s3 = (MatScalar)b[2+idx];
792: s4 = (MatScalar)b[3+idx];
793: while (nz--) {
794: idx = 4*(*vi++);
795: x1 = t[idx];
796: x2 = t[1+idx];
797: x3 = t[2+idx];
798: x4 = t[3+idx];
799: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
800: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
801: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
802: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
803: v += 16;
804: }
805: idx = 4*i;
806: t[idx] = s1;
807: t[1+idx] = s2;
808: t[2+idx] = s3;
809: t[3+idx] = s4;
810: }
811: /* backward solve the upper triangular */
812: for (i=n-1; i>=0; i--) {
813: v = aa + 16*diag[i] + 16;
814: vi = aj + diag[i] + 1;
815: nz = ai[i+1] - diag[i] - 1;
816: idt = 4*i;
817: s1 = t[idt];
818: s2 = t[1+idt];
819: s3 = t[2+idt];
820: s4 = t[3+idt];
821: while (nz--) {
822: idx = 4*(*vi++);
823: x1 = t[idx];
824: x2 = t[1+idx];
825: x3 = t[2+idx];
826: x4 = t[3+idx];
827: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
828: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
829: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
830: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
831: v += 16;
832: }
833: idc = 4*(*c--);
834: v = aa + 16*diag[i];
835: t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
836: t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
837: t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
838: t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
839: x[idc] = (PetscScalar)t[idt];
840: x[1+idc] = (PetscScalar)t[1+idt];
841: x[2+idc] = (PetscScalar)t[2+idt];
842: x[3+idc] = (PetscScalar)t[3+idt];
843: }
845: ISRestoreIndices(isrow,&rout);
846: ISRestoreIndices(iscol,&cout);
847: VecRestoreArrayRead(bb,&b);
848: VecRestoreArray(xx,&x);
849: PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);
850: return 0;
851: }
853: #if defined(PETSC_HAVE_SSE)
855: #include PETSC_HAVE_SSE
857: PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx)
858: {
859: /*
860: Note: This code uses demotion of double
861: to float when performing the mixed-mode computation.
862: This may not be numerically reasonable for all applications.
863: */
864: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
865: IS iscol=a->col,isrow=a->row;
866: PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,ai16;
867: const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
868: MatScalar *aa=a->a,*v;
869: PetscScalar *x,*b,*t;
871: /* Make space in temp stack for 16 Byte Aligned arrays */
872: float ssealignedspace[11],*tmps,*tmpx;
873: unsigned long offset;
875: SSE_SCOPE_BEGIN;
877: offset = (unsigned long)ssealignedspace % 16;
878: if (offset) offset = (16 - offset)/4;
879: tmps = &ssealignedspace[offset];
880: tmpx = &ssealignedspace[offset+4];
881: PREFETCH_NTA(aa+16*ai[1]);
883: VecGetArray(bb,&b);
884: VecGetArray(xx,&x);
885: t = a->solve_work;
887: ISGetIndices(isrow,&rout); r = rout;
888: ISGetIndices(iscol,&cout)); c = cout + (n-1;
890: /* forward solve the lower triangular */
891: idx = 4*(*r++);
892: t[0] = b[idx]; t[1] = b[1+idx];
893: t[2] = b[2+idx]; t[3] = b[3+idx];
894: v = aa + 16*ai[1];
896: for (i=1; i<n;) {
897: PREFETCH_NTA(&v[8]);
898: vi = aj + ai[i];
899: nz = diag[i] - ai[i];
900: idx = 4*(*r++);
902: /* Demote sum from double to float */
903: CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]);
904: LOAD_PS(tmps,XMM7);
906: while (nz--) {
907: PREFETCH_NTA(&v[16]);
908: idx = 4*(*vi++);
910: /* Demote solution (so far) from double to float */
911: CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]);
913: /* 4x4 Matrix-Vector product with negative accumulation: */
914: SSE_INLINE_BEGIN_2(tmpx,v)
915: SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
917: /* First Column */
918: SSE_COPY_PS(XMM0,XMM6)
919: SSE_SHUFFLE(XMM0,XMM0,0x00)
920: SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
921: SSE_SUB_PS(XMM7,XMM0)
923: /* Second Column */
924: SSE_COPY_PS(XMM1,XMM6)
925: SSE_SHUFFLE(XMM1,XMM1,0x55)
926: SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
927: SSE_SUB_PS(XMM7,XMM1)
929: SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
931: /* Third Column */
932: SSE_COPY_PS(XMM2,XMM6)
933: SSE_SHUFFLE(XMM2,XMM2,0xAA)
934: SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
935: SSE_SUB_PS(XMM7,XMM2)
937: /* Fourth Column */
938: SSE_COPY_PS(XMM3,XMM6)
939: SSE_SHUFFLE(XMM3,XMM3,0xFF)
940: SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
941: SSE_SUB_PS(XMM7,XMM3)
942: SSE_INLINE_END_2
944: v += 16;
945: }
946: idx = 4*i;
947: v = aa + 16*ai[++i];
948: PREFETCH_NTA(v);
949: STORE_PS(tmps,XMM7);
951: /* Promote result from float to double */
952: CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps);
953: }
954: /* backward solve the upper triangular */
955: idt = 4*(n-1);
956: ai16 = 16*diag[n-1];
957: v = aa + ai16 + 16;
958: for (i=n-1; i>=0;) {
959: PREFETCH_NTA(&v[8]);
960: vi = aj + diag[i] + 1;
961: nz = ai[i+1] - diag[i] - 1;
963: /* Demote accumulator from double to float */
964: CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]);
965: LOAD_PS(tmps,XMM7);
967: while (nz--) {
968: PREFETCH_NTA(&v[16]);
969: idx = 4*(*vi++);
971: /* Demote solution (so far) from double to float */
972: CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]);
974: /* 4x4 Matrix-Vector Product with negative accumulation: */
975: SSE_INLINE_BEGIN_2(tmpx,v)
976: SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
978: /* First Column */
979: SSE_COPY_PS(XMM0,XMM6)
980: SSE_SHUFFLE(XMM0,XMM0,0x00)
981: SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
982: SSE_SUB_PS(XMM7,XMM0)
984: /* Second Column */
985: SSE_COPY_PS(XMM1,XMM6)
986: SSE_SHUFFLE(XMM1,XMM1,0x55)
987: SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
988: SSE_SUB_PS(XMM7,XMM1)
990: SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
992: /* Third Column */
993: SSE_COPY_PS(XMM2,XMM6)
994: SSE_SHUFFLE(XMM2,XMM2,0xAA)
995: SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
996: SSE_SUB_PS(XMM7,XMM2)
998: /* Fourth Column */
999: SSE_COPY_PS(XMM3,XMM6)
1000: SSE_SHUFFLE(XMM3,XMM3,0xFF)
1001: SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
1002: SSE_SUB_PS(XMM7,XMM3)
1003: SSE_INLINE_END_2
1004: v += 16;
1005: }
1006: v = aa + ai16;
1007: ai16 = 16*diag[--i];
1008: PREFETCH_NTA(aa+ai16+16);
1009: /*
1010: Scale the result by the diagonal 4x4 block,
1011: which was inverted as part of the factorization
1012: */
1013: SSE_INLINE_BEGIN_3(v,tmps,aa+ai16)
1014: /* First Column */
1015: SSE_COPY_PS(XMM0,XMM7)
1016: SSE_SHUFFLE(XMM0,XMM0,0x00)
1017: SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
1019: /* Second Column */
1020: SSE_COPY_PS(XMM1,XMM7)
1021: SSE_SHUFFLE(XMM1,XMM1,0x55)
1022: SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
1023: SSE_ADD_PS(XMM0,XMM1)
1025: SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
1027: /* Third Column */
1028: SSE_COPY_PS(XMM2,XMM7)
1029: SSE_SHUFFLE(XMM2,XMM2,0xAA)
1030: SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
1031: SSE_ADD_PS(XMM0,XMM2)
1033: /* Fourth Column */
1034: SSE_COPY_PS(XMM3,XMM7)
1035: SSE_SHUFFLE(XMM3,XMM3,0xFF)
1036: SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
1037: SSE_ADD_PS(XMM0,XMM3)
1039: SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
1040: SSE_INLINE_END_3
1042: /* Promote solution from float to double */
1043: CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps);
1045: /* Apply reordering to t and stream into x. */
1046: /* This way, x doesn't pollute the cache. */
1047: /* Be careful with size: 2 doubles = 4 floats! */
1048: idc = 4*(*c--);
1049: SSE_INLINE_BEGIN_2((float*)&t[idt],(float*)&x[idc])
1050: /* x[idc] = t[idt]; x[1+idc] = t[1+idc]; */
1051: SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
1052: SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0)
1053: /* x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */
1054: SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1)
1055: SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1)
1056: SSE_INLINE_END_2
1057: v = aa + ai16 + 16;
1058: idt -= 4;
1059: }
1061: ISRestoreIndices(isrow,&rout);
1062: ISRestoreIndices(iscol,&cout);
1063: VecRestoreArray(bb,&b);
1064: VecRestoreArray(xx,&x);
1065: PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);
1066: SSE_SCOPE_END;
1067: return 0;
1068: }
1070: #endif
1072: PetscErrorCode MatSolve_SeqBAIJ_3_inplace(Mat A,Vec bb,Vec xx)
1073: {
1074: Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data;
1075: IS iscol=a->col,isrow=a->row;
1076: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1077: PetscInt i,nz,idx,idt,idc;
1078: const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
1079: const MatScalar *aa=a->a,*v;
1080: PetscScalar *x,s1,s2,s3,x1,x2,x3,*t;
1081: const PetscScalar *b;
1083: VecGetArrayRead(bb,&b);
1084: VecGetArray(xx,&x);
1085: t = a->solve_work;
1087: ISGetIndices(isrow,&rout); r = rout;
1088: ISGetIndices(iscol,&cout)); c = cout + (n-1;
1090: /* forward solve the lower triangular */
1091: idx = 3*(*r++);
1092: t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
1093: for (i=1; i<n; i++) {
1094: v = aa + 9*ai[i];
1095: vi = aj + ai[i];
1096: nz = diag[i] - ai[i];
1097: idx = 3*(*r++);
1098: s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
1099: while (nz--) {
1100: idx = 3*(*vi++);
1101: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
1102: s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
1103: s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
1104: s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
1105: v += 9;
1106: }
1107: idx = 3*i;
1108: t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
1109: }
1110: /* backward solve the upper triangular */
1111: for (i=n-1; i>=0; i--) {
1112: v = aa + 9*diag[i] + 9;
1113: vi = aj + diag[i] + 1;
1114: nz = ai[i+1] - diag[i] - 1;
1115: idt = 3*i;
1116: s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
1117: while (nz--) {
1118: idx = 3*(*vi++);
1119: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
1120: s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
1121: s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
1122: s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
1123: v += 9;
1124: }
1125: idc = 3*(*c--);
1126: v = aa + 9*diag[i];
1127: x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3;
1128: x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
1129: x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
1130: }
1131: ISRestoreIndices(isrow,&rout);
1132: ISRestoreIndices(iscol,&cout);
1133: VecRestoreArrayRead(bb,&b);
1134: VecRestoreArray(xx,&x);
1135: PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);
1136: return 0;
1137: }
1139: PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
1140: {
1141: Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data;
1142: IS iscol=a->col,isrow=a->row;
1143: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
1144: PetscInt i,nz,idx,idt,idc,m;
1145: const PetscInt *r,*c,*rout,*cout;
1146: const MatScalar *aa=a->a,*v;
1147: PetscScalar *x,s1,s2,s3,x1,x2,x3,*t;
1148: const PetscScalar *b;
1150: VecGetArrayRead(bb,&b);
1151: VecGetArray(xx,&x);
1152: t = a->solve_work;
1154: ISGetIndices(isrow,&rout); r = rout;
1155: ISGetIndices(iscol,&cout); c = cout;
1157: /* forward solve the lower triangular */
1158: idx = 3*r[0];
1159: t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
1160: for (i=1; i<n; i++) {
1161: v = aa + 9*ai[i];
1162: vi = aj + ai[i];
1163: nz = ai[i+1] - ai[i];
1164: idx = 3*r[i];
1165: s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
1166: for (m=0; m<nz; m++) {
1167: idx = 3*vi[m];
1168: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
1169: s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
1170: s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
1171: s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
1172: v += 9;
1173: }
1174: idx = 3*i;
1175: t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
1176: }
1177: /* backward solve the upper triangular */
1178: for (i=n-1; i>=0; i--) {
1179: v = aa + 9*(adiag[i+1]+1);
1180: vi = aj + adiag[i+1]+1;
1181: nz = adiag[i] - adiag[i+1] - 1;
1182: idt = 3*i;
1183: s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
1184: for (m=0; m<nz; m++) {
1185: idx = 3*vi[m];
1186: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
1187: s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
1188: s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
1189: s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
1190: v += 9;
1191: }
1192: idc = 3*c[i];
1193: x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3;
1194: x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
1195: x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
1196: }
1197: ISRestoreIndices(isrow,&rout);
1198: ISRestoreIndices(iscol,&cout);
1199: VecRestoreArrayRead(bb,&b);
1200: VecRestoreArray(xx,&x);
1201: PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);
1202: return 0;
1203: }
1205: PetscErrorCode MatSolve_SeqBAIJ_2_inplace(Mat A,Vec bb,Vec xx)
1206: {
1207: Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data;
1208: IS iscol=a->col,isrow=a->row;
1209: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1210: PetscInt i,nz,idx,idt,idc;
1211: const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
1212: const MatScalar *aa=a->a,*v;
1213: PetscScalar *x,s1,s2,x1,x2,*t;
1214: const PetscScalar *b;
1216: VecGetArrayRead(bb,&b);
1217: VecGetArray(xx,&x);
1218: t = a->solve_work;
1220: ISGetIndices(isrow,&rout); r = rout;
1221: ISGetIndices(iscol,&cout)); c = cout + (n-1;
1223: /* forward solve the lower triangular */
1224: idx = 2*(*r++);
1225: t[0] = b[idx]; t[1] = b[1+idx];
1226: for (i=1; i<n; i++) {
1227: v = aa + 4*ai[i];
1228: vi = aj + ai[i];
1229: nz = diag[i] - ai[i];
1230: idx = 2*(*r++);
1231: s1 = b[idx]; s2 = b[1+idx];
1232: while (nz--) {
1233: idx = 2*(*vi++);
1234: x1 = t[idx]; x2 = t[1+idx];
1235: s1 -= v[0]*x1 + v[2]*x2;
1236: s2 -= v[1]*x1 + v[3]*x2;
1237: v += 4;
1238: }
1239: idx = 2*i;
1240: t[idx] = s1; t[1+idx] = s2;
1241: }
1242: /* backward solve the upper triangular */
1243: for (i=n-1; i>=0; i--) {
1244: v = aa + 4*diag[i] + 4;
1245: vi = aj + diag[i] + 1;
1246: nz = ai[i+1] - diag[i] - 1;
1247: idt = 2*i;
1248: s1 = t[idt]; s2 = t[1+idt];
1249: while (nz--) {
1250: idx = 2*(*vi++);
1251: x1 = t[idx]; x2 = t[1+idx];
1252: s1 -= v[0]*x1 + v[2]*x2;
1253: s2 -= v[1]*x1 + v[3]*x2;
1254: v += 4;
1255: }
1256: idc = 2*(*c--);
1257: v = aa + 4*diag[i];
1258: x[idc] = t[idt] = v[0]*s1 + v[2]*s2;
1259: x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
1260: }
1261: ISRestoreIndices(isrow,&rout);
1262: ISRestoreIndices(iscol,&cout);
1263: VecRestoreArrayRead(bb,&b);
1264: VecRestoreArray(xx,&x);
1265: PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);
1266: return 0;
1267: }
1269: PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
1270: {
1271: Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data;
1272: IS iscol=a->col,isrow=a->row;
1273: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
1274: PetscInt i,nz,idx,jdx,idt,idc,m;
1275: const PetscInt *r,*c,*rout,*cout;
1276: const MatScalar *aa=a->a,*v;
1277: PetscScalar *x,s1,s2,x1,x2,*t;
1278: const PetscScalar *b;
1280: VecGetArrayRead(bb,&b);
1281: VecGetArray(xx,&x);
1282: t = a->solve_work;
1284: ISGetIndices(isrow,&rout); r = rout;
1285: ISGetIndices(iscol,&cout); c = cout;
1287: /* forward solve the lower triangular */
1288: idx = 2*r[0];
1289: t[0] = b[idx]; t[1] = b[1+idx];
1290: for (i=1; i<n; i++) {
1291: v = aa + 4*ai[i];
1292: vi = aj + ai[i];
1293: nz = ai[i+1] - ai[i];
1294: idx = 2*r[i];
1295: s1 = b[idx]; s2 = b[1+idx];
1296: for (m=0; m<nz; m++) {
1297: jdx = 2*vi[m];
1298: x1 = t[jdx]; x2 = t[1+jdx];
1299: s1 -= v[0]*x1 + v[2]*x2;
1300: s2 -= v[1]*x1 + v[3]*x2;
1301: v += 4;
1302: }
1303: idx = 2*i;
1304: t[idx] = s1; t[1+idx] = s2;
1305: }
1306: /* backward solve the upper triangular */
1307: for (i=n-1; i>=0; i--) {
1308: v = aa + 4*(adiag[i+1]+1);
1309: vi = aj + adiag[i+1]+1;
1310: nz = adiag[i] - adiag[i+1] - 1;
1311: idt = 2*i;
1312: s1 = t[idt]; s2 = t[1+idt];
1313: for (m=0; m<nz; m++) {
1314: idx = 2*vi[m];
1315: x1 = t[idx]; x2 = t[1+idx];
1316: s1 -= v[0]*x1 + v[2]*x2;
1317: s2 -= v[1]*x1 + v[3]*x2;
1318: v += 4;
1319: }
1320: idc = 2*c[i];
1321: x[idc] = t[idt] = v[0]*s1 + v[2]*s2;
1322: x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
1323: }
1324: ISRestoreIndices(isrow,&rout);
1325: ISRestoreIndices(iscol,&cout);
1326: VecRestoreArrayRead(bb,&b);
1327: VecRestoreArray(xx,&x);
1328: PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);
1329: return 0;
1330: }
1332: PetscErrorCode MatSolve_SeqBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
1333: {
1334: Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data;
1335: IS iscol=a->col,isrow=a->row;
1336: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1337: PetscInt i,nz;
1338: const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
1339: const MatScalar *aa=a->a,*v;
1340: PetscScalar *x,s1,*t;
1341: const PetscScalar *b;
1343: if (!n) return 0;
1345: VecGetArrayRead(bb,&b);
1346: VecGetArray(xx,&x);
1347: t = a->solve_work;
1349: ISGetIndices(isrow,&rout); r = rout;
1350: ISGetIndices(iscol,&cout)); c = cout + (n-1;
1352: /* forward solve the lower triangular */
1353: t[0] = b[*r++];
1354: for (i=1; i<n; i++) {
1355: v = aa + ai[i];
1356: vi = aj + ai[i];
1357: nz = diag[i] - ai[i];
1358: s1 = b[*r++];
1359: while (nz--) {
1360: s1 -= (*v++)*t[*vi++];
1361: }
1362: t[i] = s1;
1363: }
1364: /* backward solve the upper triangular */
1365: for (i=n-1; i>=0; i--) {
1366: v = aa + diag[i] + 1;
1367: vi = aj + diag[i] + 1;
1368: nz = ai[i+1] - diag[i] - 1;
1369: s1 = t[i];
1370: while (nz--) {
1371: s1 -= (*v++)*t[*vi++];
1372: }
1373: x[*c--] = t[i] = aa[diag[i]]*s1;
1374: }
1376: ISRestoreIndices(isrow,&rout);
1377: ISRestoreIndices(iscol,&cout);
1378: VecRestoreArrayRead(bb,&b);
1379: VecRestoreArray(xx,&x);
1380: PetscLogFlops(2.0*1*(a->nz) - A->cmap->n);
1381: return 0;
1382: }
1384: PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
1385: {
1386: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1387: IS iscol = a->col,isrow = a->row;
1388: PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag = a->diag,nz;
1389: const PetscInt *rout,*cout,*r,*c;
1390: PetscScalar *x,*tmp,sum;
1391: const PetscScalar *b;
1392: const MatScalar *aa = a->a,*v;
1394: if (!n) return 0;
1396: VecGetArrayRead(bb,&b);
1397: VecGetArray(xx,&x);
1398: tmp = a->solve_work;
1400: ISGetIndices(isrow,&rout); r = rout;
1401: ISGetIndices(iscol,&cout); c = cout;
1403: /* forward solve the lower triangular */
1404: tmp[0] = b[r[0]];
1405: v = aa;
1406: vi = aj;
1407: for (i=1; i<n; i++) {
1408: nz = ai[i+1] - ai[i];
1409: sum = b[r[i]];
1410: PetscSparseDenseMinusDot(sum,tmp,v,vi,nz);
1411: tmp[i] = sum;
1412: v += nz; vi += nz;
1413: }
1415: /* backward solve the upper triangular */
1416: for (i=n-1; i>=0; i--) {
1417: v = aa + adiag[i+1]+1;
1418: vi = aj + adiag[i+1]+1;
1419: nz = adiag[i]-adiag[i+1]-1;
1420: sum = tmp[i];
1421: PetscSparseDenseMinusDot(sum,tmp,v,vi,nz);
1422: x[c[i]] = tmp[i] = sum*v[nz]; /* v[nz] = aa[adiag[i]] */
1423: }
1425: ISRestoreIndices(isrow,&rout);
1426: ISRestoreIndices(iscol,&cout);
1427: VecRestoreArrayRead(bb,&b);
1428: VecRestoreArray(xx,&x);
1429: PetscLogFlops(2.0*a->nz - A->cmap->n);
1430: return 0;
1431: }