V3FIT
precon2d.f
1  MODULE precon2d
2  USE stel_kinds, ONLY: dp
3  USE vmec_dim
4  USE vmec_params
5  USE vparams, ONLY: nthreed, one, zero
6  USE vmec_input, ONLY: ntor, nzeta, lfreeb, lasym
7  USE timer_sub
8  USE safe_open_mod
9  USE directaccess
10  USE parallel_include_module
11 
12  IMPLICIT NONE
13 C-----------------------------------------------
14 C L o c a l V a r i a b l e s
15 C-----------------------------------------------
16  INTEGER, PRIVATE, PARAMETER :: sp = dp
17  INTEGER, PRIVATE :: ntyptot, m_2d, n_2d, ntype_2d
18  INTEGER, PRIVATE, ALLOCATABLE :: ipiv_blk(:,:)
19  INTEGER, PRIVATE :: mblk_size
20  INTEGER, PRIVATE :: mystart(3), myend(3)
21  LOGICAL, PRIVATE :: FIRSTPASS=.true.
22  REAL(sp),PRIVATE, ALLOCATABLE, DIMENSION(:,:,:,:,:,:,:) ::
23  1 block_diag, block_plus, block_mins,
24  2 block_dsave, block_msave, block_psave
25  REAL(sp),PRIVATE, ALLOCATABLE, DIMENSION(:,:,:,:,:,:) ::
26  1 block_diag_sw, block_plus_sw, block_mins_sw
27 
28  REAL(dp), PRIVATE, DIMENSION(:,:,:,:), ALLOCATABLE :: gc_save
29  REAL(dp) :: ctor_prec2d
30  INTEGER :: ictrl_prec2d
31  LOGICAL :: lHess_exact = .true., !FALSE -> FASTER, LESS ACCURATE VACUUM CALCULATION OF HESSIAN
32  1 l_backslv = .false.,
33  2 l_comp_prec2d = .true.,
34  3 l_edge = .false., !=T IF EDGE PERTURBED
35  4 edge_mesh(3)
36 
37  LOGICAL, PARAMETER, PRIVATE :: lscreen = .false.
38  INTEGER, PARAMETER, PRIVATE :: jstart(3) = (/1,2,3/)
39 
40  PRIVATE :: swap_forces, reswap_forces
41 
42 !
43 ! Direct-Access (swap to disk) stuff
44 ! CHARACTER(LEN=3) :: FlashDrive ="F:\"
45  CHARACTER(LEN=3) :: FlashDrive =""
46  CHARACTER(LEN=128) :: ScratchFile=""
47  INTEGER, PARAMETER :: blmin=1, bldia=2, blpls=3
48  REAL(sp), ALLOCATABLE, DIMENSION(:,:,:) :: DataItem
49  INTEGER :: iunit_dacess=10
50  LOGICAL :: lswap2disk = .false. !Set internally if blocks do not fit in memory
51  INTEGER, PARAMETER :: LOWER=3,diag=2,upper=1
52 
53 C-----------------------------------------------
54 !
55 ! SP: forces single precision for blocks (smaller size)
56 ! ICTRL_PREC2D: controls initialization and application of 2d block preconditioner
57 ! = 0, no preconditioner applied
58 ! = 1, apply preconditioner
59 ! = 2, initial call of funct3d to set up residue vector, store saved vectors,
60 ! and (for .not.lasym case), call LAMBLKS routine
61 ! = 3, radial jog vector is being computed to calculate hessian elements
62 ! L_BACKSLV: if true, test that Hessian is inverted correctly by back-solving
63 ! LHESS_EXACT : if true, edge value of ctor (in bcovar) is computed as a constant to make
64 ! Hessian symmetric. Also, sets ivacskip=0 in call to vacuum in computation of
65 ! Hessian. However, the ivacskip=0 option is (very) slow and found not to be necessary
66 ! in practice. Set this true primarily for debugging purposes (check Ap ~ -p in MatVec
67 ! routine, for example, in GMRes module)
68 !
69 
70  CONTAINS
71 
72  SUBROUTINE swap_forces(gc, temp, mblk, nblocks)
73 C-----------------------------------------------
74 C D u m m y A r g u m e n t s
75 C-----------------------------------------------
76  INTEGER :: mblk, nblocks
77  REAL(dp), DIMENSION(nblocks,mblk), INTENT(in) :: gc
78  REAL(dp), DIMENSION(mblk,nblocks), INTENT(out) :: temp
79 C-----------------------------------------------
80 !
81 ! reorders forces (gc) array prior to applying
82 ! block-tridiagonal pre-conditioner. on exit, temp is the reordered array
83 ! flip sign so eigenvalue is negative (corresponding to damping)
84 !
85  temp = -transpose(gc)
86 
87  END SUBROUTINE swap_forces
88 
89  SUBROUTINE reswap_forces(temp, gc, mblk, nblocks)
90 C-----------------------------------------------
91 C D u m m y A r g u m e n t s
92 C-----------------------------------------------
93  INTEGER :: mblk, nblocks
94  REAL(dp), DIMENSION(nblocks,mblk), INTENT(inout) :: gc
95  REAL(dp), DIMENSION(mblk,nblocks), INTENT(in) :: temp
96 C-----------------------------------------------
97 !
98 ! Following application of block pre-conditioner, restores original
99 ! order of forces (gc) array previously ordered by call to "swap_forces"
100 !
101  gc = transpose(temp)
102 
103  END SUBROUTINE reswap_forces
104 
105  SUBROUTINE block_precond(gc)
106 C-----------------------------------------------
107 C D u m m y A r g u m e n t s
108 C-----------------------------------------------
109  REAL(dp), DIMENSION(ns,0:ntor,0:mpol1,ntyptot) :: gc
110 C-----------------------------------------------
111 C L o c a l V a r i a b l e s
112 C-----------------------------------------------
113  INTEGER :: mblk, m, n, js, ntype, istat
114  REAL(dp), ALLOCATABLE, DIMENSION(:,:) :: temp
115  REAL(dp) :: t1, error
116 C-----------------------------------------------
117 !
118 ! Applies 2D block-preconditioner to forces vector (gc)
119 !
120  IF (ntyptot .le. 0) stop 'ntyptot must be > 0'
121 
122  IF (l_backslv) gc_save = gc
123 
124  mblk = ntyptot*mnsize
125  ALLOCATE (temp(mblk,ns), stat=ntype)
126  IF (ntype .ne. 0) stop 'Allocation error1 in block_precond'
127 
128 ! Reorder gc(JS,MN) -> temp(MN,JS)
129  CALL swap_forces(gc, temp, mblk, ns)
130 
131 ! Apply preconditioner to temp, using LU factors stored in block_... matrices
132  IF (lswap2disk) THEN
133  CALL blk3d_slv_swp(temp, ipiv_blk, mblk, ns)
134  ELSE
135  CALL blk3d_slv(block_diag, block_mins, block_plus, temp,
136  1 ipiv_blk, mblk, ns)
137  END IF
138 
139 ! Restores original ordering (after preconditioner applied): temp(MN,JS) -> gc(JS,MN)
140  CALL reswap_forces(temp, gc, mblk, ns)
141 
142  IF (l_backslv) THEN
143  l_backslv = .false.
144 
145  WRITE (6, *) ' Writing block Hessian check to unit 34'
146  WRITE (34, *)
147  WRITE (34, *) ' BLK3D FACTORIZATION CHECK: Ax = b ?'
148 
149  DO n = 0, ntor
150  WRITE (34, *) ' N = ', n
151  DO m = 0, mpol1
152  WRITE (34, *) ' M = ', m
153  DO ntype = 1, ntyptot
154  WRITE (34, *) ' TYPE = ', ntype
155  WRITE (34, *)
156  1 ' js Ax b Ax - b' //
157  2 ' RelErr'
158  js = 1
159  t1 = sum(block_dsave(n,m,ntype,:,:,:,js)*gc(js,:,:,:)
160  1 + block_psave(n,m,ntype,:,:,:,js)*gc(js+1,:,:,:))
161 
162  error = t1 + gc_save(js,n,m,ntype)
163  IF (t1 .eq. zero) t1 = epsilon(t1)
164  WRITE (34, 100) js, t1, -gc_save(js,n,m,ntype),
165  2 error, error/t1
166 
167  DO js = 2, ns-1
168  t1 = sum(
169  1 block_msave(n,m,ntype,:,:,:,js)*gc(js-1,:,:,:)
170  2 + block_dsave(n,m,ntype,:,:,:,js)*gc(js,:,:,:)
171  3 + block_psave(n,m,ntype,:,:,:,js)*gc(js+1,:,:,:))
172  error = t1 + gc_save(js,n,m,ntype)
173  IF (t1 .eq. zero) t1 = epsilon(t1)
174  WRITE (34, 100) js, t1, -gc_save(js,n,m,ntype),
175  2 error, error/t1
176  END DO
177 
178  js = ns
179  t1 = sum(block_msave(n,m,ntype,:,:,:,js)*gc(js-1,:,:,:)
180  1 + block_dsave(n,m,ntype,:,:,:,js)*gc(js,:,:,:))
181  error = t1 + gc_save(js,n,m,ntype)
182  IF (t1 .eq. zero) t1 = epsilon(t1)
183  WRITE (34, 100) js, t1, -gc_save(js,n,m,ntype),
184  2 error, error/t1
185  END DO
186  END DO
187  END DO
188 
189  IF (.not.l_backslv) DEALLOCATE(block_dsave, block_msave,
190  1 block_psave, gc_save, stat=istat)
191 
192  100 FORMAT(i6,1p,4e14.4)
193  END IF
194 
195 
196  DEALLOCATE (temp, stat=istat)
197 
198  END SUBROUTINE block_precond
199 
200  SUBROUTINE block_precond_par(gc)
201  USE blocktridiagonalsolver, ONLY: setmatrixrhs
202  USE blocktridiagonalsolver, ONLY: backwardsolve
203  USE blocktridiagonalsolver, ONLY: getsolutionvector
204  USE parallel_include_module
205 C-----------------------------------------------
206 C D u m m y A r g u m e n t s
207 C-----------------------------------------------
208  REAL(dp), DIMENSION(0:ntor,0:mpol1,ns,ntyptot) :: gc
209 C-----------------------------------------------
210 C L o c a l V a r i a b l e s
211 C-----------------------------------------------
212  INTEGER :: mblk, istat, globrow
213  REAL(dp), ALLOCATABLE, DIMENSION(:,:) :: tmp
214  REAL(dp) :: ton, toff
215  REAL(dp), DIMENSION (:,:), ALLOCATABLE :: solvec
216 C-----------------------------------------------
217  IF (.NOT.lactive) THEN
218  RETURN
219  END IF
220 !
221 ! Applies 2D block-preconditioner to forces vector (gc)
222 !
223  IF (ntyptot .LE. 0) THEN
224  stop 'ntyptot must be > 0'
225  END IF
226 
227  mblk = ntyptot*mnsize
228 
229 ! Apply preconditioner to temp, using LU factors stored in block_... matrices
230 
231  ALLOCATE (tmp(mblk,ns), stat=istat)
232  CALL tolastns(gc, tmp)
233  tmp(:,tlglob:trglob) = -tmp(:,tlglob:trglob)
234 
235  DO globrow=tlglob, trglob
236  CALL setmatrixrhs(globrow,tmp(:,globrow))
237  END DO
238  DEALLOCATE (tmp, stat=istat)
239 
240  CALL second0(ton)
241  CALL backwardsolve
242  CALL second0(toff)
243  bcyclic_backwardsolve_time=bcyclic_backwardsolve_time+(toff-ton)
244 
245  ALLOCATE (solvec(mblk,ns), stat=istat)
246  IF (istat .NE. 0) THEN
247  stop 'Allocation error in block_precond before gather'
248  END IF
249 
250  DO globrow=tlglob, trglob
251  CALL getsolutionvector (globrow,solvec(:,globrow))
252  END DO
253 
254  CALL tolastntype(solvec, gc)
255 
256  CALL gather4xarray(gc)
257 
258  DEALLOCATE (solvec)
259 
260  END SUBROUTINE block_precond_par
261 
262  SUBROUTINE compute_blocks_par (xc, xcdot, gc)
263  USE blocktridiagonalsolver, ONLY: forwardsolve
264  USE parallel_include_module
265  USE vmec_main, ONLY: iter2
266 C-----------------------------------------------
267 C D u m m y A r g u m e n t s
268 C-----------------------------------------------
269  REAL(dp),DIMENSION(0:ntor,0:mpol1,ns,3*ntmax) :: xc, gc, xcdot
270 C-----------------------------------------------
271 C L o c a l V a r i a b l e s
272 C-----------------------------------------------
273  REAL(dp), PARAMETER :: p5 = 0.5_dp
274  INTEGER :: m, n, i, ntype, istat, mblk, ibsize, iunit
275  REAL(dp) :: time_on, time_off, bsize, tprec2don, tprec2doff
276  REAL(dp) :: ton, toff
277  CHARACTER(LEN=100):: label
278  LOGICAL, PARAMETER :: lscreen = .false.
279  INTEGER :: j, k, l
280 C-----------------------------------------------
281 !
282 ! COMPUTES THE JACOBIAN BLOCKS block_mins, block_diag, block_plus
283 ! USING EITHER A SLOW - BUT RELIABLE - "JOG" TECHNIQUE, OR
284 ! USING PARTIAL ANALYTIC FORMULAE.
285 !
286 ! THE SUBROUTINE lam_blks IS CALLED FROM BCOVAR TO COMPUTE
287 ! THE ANALYTIC BLOCK ELEMENTS
288 !
289  CALL second0(tprec2don)
290 
291  IF (l_backslv .and. sp.ne.dp) THEN
292  stop 'Should set sp = dp!'
293  END IF
294 
295  ntyptot = SIZE(gc,4)
296  IF (ntyptot .NE. 3*ntmax) THEN
297  stop ' NTYPTOT != 3*ntmax'
298  END IF
299  mblk = ntyptot*mnsize
300 
301  bsize = real(mblk*mblk, dp)*3*kind(block_diag)
302  IF (bsize .gt. huge(mblk)) THEN
303  WRITE (6, *) ' bsize: ', bsize, ' exceeds HUGE(int): ',
304  & huge(mblk)
305 ! WRITE (6, *) ' Blocks will be written to disk.'
306 ! lswap2disk = .TRUE.
307  ELSE
308  lswap2disk = .false.
309  END IF
310 
311  bsize = bsize*ns
312  IF (bsize .lt. 1.e6_dp) THEN
313  ibsize = bsize/1.e1_dp
314  label = " Kb"
315  ELSE IF (bsize .lt. 1.e9_dp) THEN
316  ibsize = bsize/1.e4_dp
317  label = " Mb"
318  ELSE
319  ibsize = bsize/1.e7_dp
320  label = " Gb"
321  END IF
322 
323  DO i = 1,2
324  IF (i .eq. 1) THEN
325  iunit = 6
326  END IF
327  IF (i .eq. 2) THEN
328  iunit = nthreed
329  END IF
330  IF (grank.EQ.0) THEN
331  WRITE (iunit, '(/,2x,a,i5,a,/,2x,a,i5,a)')
332  & 'Initializing 2D block preconditioner at ', iter2,
333  & ' iterations',
334  & 'Estimated time to compute Hessian = ',
335  & 3*ntyptot*mnsize,' VMEC time steps'
336  WRITE (iunit, '(2x,a,i4,a,f12.2,a)') 'Block dim: ', mblk,
337  & '^2 Preconditioner size: ', real(ibsize)/100,
338  & trim(label)
339  END IF
340  END DO
341 !
342 ! COMPUTE AND STORE BLOCKS (MN X MN) FOR PRECONDITIONER
343 !
344  CALL second0(time_on)
345 
346  ALLOCATE (gc_save(0:ntor,0:mpol1,ns,ntyptot), stat=istat)
347  IF (istat .NE. 0) THEN
348  stop 'Allocation error: gc_save in compute_blocks'
349  END IF
350 
351  IF (ALLOCATED(block_diag)) THEN
352  DEALLOCATE (block_diag, block_plus, block_mins, stat=istat)
353  IF (istat .ne. 0) THEN
354  stop 'Deallocation error in compute blocks'
355  END IF
356  END IF
357 
358 !
359 ! GENERAL (SLOWER BY 2/3 THAN SYMMETRIC VERSION) METHOD: ASSUMES NO SYMMETRIES OF R, Z COEFFICIENTS
360 !
361  CALL sweep3_blocks_par (xc, xcdot, gc)
362  IF (lactive) THEN
363  CALL compute_col_scaling_par
364  END IF
365 
366  ictrl_prec2d = 1 !Signals funct3d (residue) to use block preconditioner
367 
368  CALL second0(time_off)
369  IF (grank .EQ. 0) THEN
370  WRITE (6,1000) time_off - time_on
371  WRITE (nthreed,1000) time_off - time_on
372  END IF
373 
374 !
375 ! FACTORIZE HESSIAN
376 !
377  CALL second0(time_on)
378  IF (ALLOCATED(ipiv_blk)) THEN
379  DEALLOCATE(ipiv_blk, stat=ntype)
380  END IF
381  ALLOCATE (ipiv_blk(mblk,ns), stat=ntype)
382  IF (ntype .ne. 0) THEN
383  stop 'Allocation error2 in block_precond'
384  END IF
385 
386  CALL second0(ton)
387  IF (lactive) THEN
388  CALL forwardsolve
389  END IF
390 
391  CALL second0(time_off)
392  toff = time_off
393  bcyclic_forwardsolve_time = bcyclic_forwardsolve_time
394  & + (toff - ton)
395 
396  IF (grank.EQ.0) THEN
397  WRITE(6,1001) time_off - time_on
398  WRITE(nthreed,1001) time_off - time_on
399  END IF
400 
401  IF (.NOT.l_backslv) THEN
402  DEALLOCATE (gc_save)
403  END IF
404 
405  CALL second0(tprec2doff)
406 
407  timer(tprec2d) = timer(tprec2d) + (tprec2doff - tprec2don)
408  compute_blocks_time = compute_blocks_time
409  & + (tprec2doff - tprec2don)
410 
411 1000 FORMAT(1x,' Time to compute blocks: ',f10.2,' s')
412 1001 FORMAT(1x,' Time to factor blocks: ',f10.2,' s')
413 
414  END SUBROUTINE compute_blocks_par
415 
416  SUBROUTINE sweep3_blocks_par(xc, xcdot, gc)
417  USE vmec_main, ONLY: ncurr, r01, z01, lthreed, chips, delt0r
418  USE blocktridiagonalsolver, ONLY: initialize, setblockrowcol
419  USE blocktridiagonalsolver, ONLY: writeblocks
420  USE parallel_vmec_module, ONLY: mpi_stat
421 C-----------------------------------------------
422 C D u m m y A r g u m e n t s
423 C-----------------------------------------------
424  REAL(dp), DIMENSION(0:ntor,0:mpol1,ns,ntyptot) :: xc, xcdot, gc
425 C-----------------------------------------------
426 C L o c a l V a r i a b l e s
427 C-----------------------------------------------
428  INTEGER :: js, js1, istat, mesh, lamtype, rztype, icol
429  INTEGER :: nsmin, nsmax
430  INTEGER :: lastrank, left, right
431  REAL(dp) :: eps, hj, hj_scale
432  REAL(dp), ALLOCATABLE, DIMENSION(:) :: diag_val
433  REAL(dp) :: ton, toff
434 C-----------------------------------------------
435 !
436 ! COMPUTE FORCE "RESPONSE" TO PERTURBATION AT EVERY 3rd RADIAL POINT
437 ! FOR EACH MESH STARTING AT js=1,2,3, RESPECTIVELY
438 !
439  CALL second0(ton)
440  ALLOCATE(diag_val(ns), stat=istat)
441  diag_val=zero
442  IF (grank.EQ.0) THEN
443  WRITE (6, *)
444  & " Using non-symmetric sweep to compute Hessian elements"
445  END IF
446 
447  eps = sqrt(epsilon(eps))
448  eps = eps/10
449  rztype = 2*ntmax
450  lamtype = rztype + 1
451 
452  n_2d = 0
453  m_2d = 0
454 
455  ALLOCATE(dataitem(0:ntor,0:mpol1,1:3*ntmax), stat=istat)
456  IF (istat .ne. 0) THEN
457  stop 'Allocation error in sweep3_blocks'
458  END IF
459 
460 !
461 ! CALL FUNCT3D FIRST TIME TO STORE INITIAL UN-PRECONDITIONED FORCES
462 ! THIS WILL CALL LAMBLKS (SO FAR, ONLY IMPLEMENTED FOR lasym = false)
463 !
464  ictrl_prec2d = 2 !Signals funct3d that preconditioner is being initialized
465  CALL funct3d_par(lscreen, istat)
466  IF (istat .NE. 0) THEN
467  print *,' ier_flag = ', istat,
468  1 ' in SWEEP3_BLOCKS_PAR call to funct3d_par'
469  stop
470  ENDIF
471 
472  nsmin = t1lglob
473  nsmax = t1rglob
474  xcdot(:,:,nsmin:nsmax,:) = 0
475 ! PRINT *,'rank: ', rank,' vrank: ', vrank,' nsmin: ',nsmin,
476 ! 1 ' nsmax: ', nsmax
477 
478  IF (firstpass) THEN
479  edge_mesh = .false.
480  firstpass = .false.
481  mblk_size = (ntor + 1)*(mpol1 + 1)*3*ntmax
482  IF (mblk_size .NE. ntmaxblocksize) THEN
483  stop 'wrong mblk_size in precon2d!'
484  END IF
485  CALL initialize(.false.,ns,mblk_size)
486  myend = nsmax
487 !Align starting pt in (nsmin,nsmax)
488  DO mesh = 1, 3
489  icol = mod(jstart(mesh) - nsmin, 3)
490  IF (icol .LT. 0) THEN
491  icol = icol + 3
492  END IF
493  mystart(mesh) = nsmin + icol
494  IF (mod(jstart(mesh) - ns, 3) .EQ. 0) THEN ! .AND. nsmax.EQ.ns)
495  edge_mesh(mesh) = .true.
496  END IF
497  END DO
498  END IF
499 
500 ! STORE chips in xc
501 #if defined(CHI_FORCE)
502  IF (ncurr .EQ. 1) THEN
503  xc(0,0,nsmin:nsmax,lamtype) = chips(nsmin:nsmax)
504  END IF
505 #endif
506  left = rank - 1
507  IF (rank .EQ. 0) THEN
508  left = mpi_proc_null
509  END IF
510  right = rank + 1
511  IF (rank .EQ. nranks - 1) THEN
512  right = mpi_proc_null
513  END IF
514 
515  CALL padsides(xc)
516  CALL padsides(gc)
517  CALL padsides(xcdot)
518 
519  CALL restart_iter(delt0r)
520  gc_save(:,:,nsmin:nsmax,:) = gc(:,:,nsmin:nsmax,:)
521  ictrl_prec2d = 3 !Signals funct3d that preconditioner is being computed
522 
523  CALL mpi_comm_size(ns_comm,lastrank,mpi_err)
524  lastrank = lastrank - 1
525 !
526 ! FIRST DO R00 JOG TO LOAD DIAG_VAL ARRAY (DO NOT RELY ON IT BEING THE FIRST JOG)
527 !
528  m_2d=0
529  n_2d=0
530 #ifdef _HBANGLE
531  ntype_2d = zsc + ntmax
532 #else
533  ntype_2d = rcc
534 #endif
535 ! APPLY JOG
536  hj = eps * max(abs(r01(ns)), abs(z01(ns)))
537  IF (nranks .GT. 1) THEN
538  CALL mpi_bcast(hj,1,mpi_real8,lastrank,ns_comm,mpi_err)
539  CALL mpi_bcast(edge_mesh,3,mpi_logical,lastrank,ns_comm, &
540  & mpi_err)
541  END IF
542  DO js = mystart(1), myend(1), 3
543  xcdot(n_2d,m_2d,js,ntype_2d) = hj
544  END DO
545 
546  istat = 0
547  CALL funct3d_par (lscreen, istat)
548 
549  lactive0: IF (lactive) THEN
550  IF (nranks .GT. 1) THEN
551  CALL gather4xarray(gc)
552  END IF
553  IF (istat .NE. 0) THEN
554  stop 'Error computing Hessian jog!'
555  END IF
556 ! CLEAR JOG AND STORE BLOCKS FOR THIS JOG
557  xcdot(:,:,nsmin:nsmax,:) = 0
558  DO js = mystart(1), myend(1), 3
559  dataitem = (gc(:,:,js,:) - gc_save(:,:,js,:))/hj
560  diag_val(js) = dataitem(0,0,ntype_2d)
561  END DO
562 
563  IF (nranks .GT. 1) THEN
564  icol = 0
565  IF (trglob_arr(1) .LT. 4) THEN
566  icol = 1
567  END IF
568  CALL mpi_bcast(diag_val(4), 1, mpi_real8, icol, ns_comm,
569  & mpi_err)
570  icol = nranks - 1
571  IF (tlglob_arr(nranks) .GT. ns - 3) THEN
572  icol = nranks-2
573  END IF
574  CALL mpi_bcast(diag_val(ns - 3), 1, mpi_real8, icol,
575  & ns_comm, mpi_err)
576  END IF
577  IF (diag_val(1) .EQ. zero) THEN
578  diag_val(1) = diag_val(4)
579  END IF
580  IF (diag_val(ns) .EQ. zero) THEN
581  diag_val(ns) = diag_val(ns - 3)
582  END IF
583 
584  IF (nranks .GT. 1) THEN
585  CALL mpi_sendrecv(diag_val(trglob), 1, mpi_real8, right, 1,
586  & diag_val(t1lglob), 1, mpi_real8, left, 1,
587  & ns_comm, mpi_stat, mpi_err)
588  END IF
589  DO js = mystart(2), myend(2), 3
590  diag_val(js) = diag_val(js - 1)
591  END DO
592 
593  hj_scale = max(abs(r01(ns)), abs(z01(ns)))
594 
595  IF (nranks .GT. 1) THEN
596  CALL mpi_sendrecv(diag_val(trglob), 1, mpi_real8, right, 1,
597  & diag_val(t1lglob), 1, mpi_real8, left, 1,
598  & ns_comm, mpi_stat, mpi_err)
599  CALL mpi_bcast(hj_scale, 1, mpi_real8, lastrank, ns_comm,
600  & mpi_err)
601  END IF
602 
603  DO js = mystart(3), myend(3), 3
604  diag_val(js) = diag_val(js - 1)
605  END DO
606 
607  IF (any(diag_val(tlglob:trglob) .EQ. zero)) THEN
608  print *, 'For rank: ', rank, ' some diag_val == 0'
609  stop
610  END IF
611  END IF lactive0
612 !
613 ! PERFORM "JOGS" FOR EACH VARIABLE AT EVERY 3rd RADIAL POINT ACROSS MESH
614 ! FOR ntyp = (Rcc, Rss, Rsc, Rcs, Zsc, Zcs, Zcc, Zss)
615 ! AND EVERY n2d (toroidal mode index) and EVERY m2d (poloidal mode index)
616 
617  icol=0
618 
619  ntype2d: DO ntype_2d = 1, ntyptot
620  hj = eps
621  IF (ntype_2d .LT. lamtype) THEN
622  hj = hj*hj_scale
623  END IF
624 
625  m2d: DO m_2d = 0, mpol1
626 
627  n2d: DO n_2d = 0, ntor
628 
629  icol = icol + 1
630 
631  mesh_3pt: DO mesh = 1,3
632 
633 ! APPLY JOG TO ACTIVE PROCESSORS
634  IF (lactive) THEN
635  DO js = mystart(mesh), myend(mesh), 3
636  xcdot(n_2d,m_2d,js,ntype_2d) = hj
637  END DO
638  IF (m_2d.GT.0 .AND. mystart(mesh).EQ.1) THEN
639  xcdot(n_2d,m_2d,1,ntype_2d) = 0
640  END IF
641  END IF
642 
643  l_edge = edge_mesh(mesh)
644  CALL funct3d_par (lscreen, istat)
645  IF (istat .NE. 0) stop 'Error computing Hessian jog!'
646 
647 !
648 ! COMPUTE PRECONDITIONER (HESSIAN) ELEMENTS. LINEARIZED EQUATIONS
649 ! OF FORM (FIXED mn FOR SIMPLICITY):
650 !
651 ! F(j-1) = a(j-1)x(j-2) + d(j-1)x(j-1) + b(j-1)x(j)
652 ! F(j) = a(j)x(j-1) + d(j) x(j) + b(j) x(j+1)
653 ! F(j+1) = a(j+1)x(j) + d(j+1)x(j+1) + b(j+1)x(j+2)
654 !
655 ! HESSIAN IS H(k,j) == dF(k)/dx(j); aj == block_mins; dj == block_diag; bj = block_plus
656 !
657 ! THUS, A PERTURBATION (in xc) AT POSITION js PRODUCES THE FOLLOWING RESULTS:
658 !
659 ! d(js) = dF(js )/hj(js)
660 ! b(js-1) = dF(js-1)/hj(js)
661 ! a(js+1) = dF(js+1)/hj(js)
662 !
663 !
664  lactive1: IF (lactive) THEN
665  skip3_mesh: DO js = mystart(mesh), myend(mesh), 3
666 
667 ! CLEAR JOG AND STORE BLOCKS FOR THIS JOG
668  xcdot(n_2d,m_2d,js,ntype_2d) = 0
669 
670  !block_mins(js+1)
671  js1 = js+1
672  IF (tlglob.LE.js1 .AND. js1.LE.trglob) THEN
673  dataitem =
674  & (gc(:,:,js1,:)-gc_save(:,:,js1,:))/hj
675  CALL setblockrowcol(js1,icol,dataitem,lower)
676  END IF
677 
678  !block_diag(js)
679  IF (tlglob.LE.js .AND. js.LE.trglob) THEN
680  dataitem = (gc(:,:,js,:)
681  & - gc_save(:,:,js,:))/hj
682 
683  IF (rank .EQ. lastrank .AND.
684  & js .EQ. ns .AND.
685  & .NOT.lfreeb .AND.
686  & any(dataitem(:,:,
687  & 1:rztype) .NE. zero)) THEN
688  stop 'DIAGONAL BLOCK AT EDGE != 0'
689  END IF
690 
691 !Levenberg-like offset - do NOT apply here if applied in colscaling routine
692  IF (ntype_2d .GE. lamtype) THEN
693  dataitem(n_2d,m_2d,ntype_2d) =
694  & 1.0001_dp*dataitem(n_2d,m_2d,ntype_2d)
695  END IF
696 
697  IF (dataitem(n_2d,m_2d,
698  & ntype_2d) .EQ. zero) THEN
699  dataitem(n_2d,m_2d,ntype_2d) =
700  & diag_val(js)
701  END IF
702 
703  CALL setblockrowcol(js,icol,dataitem,diag)
704  END IF
705 
706  !block_plus(js-1)
707  js1 = js - 1
708  IF (tlglob .LE. js1 .AND. js1 .LE. trglob) THEN
709  dataitem = (gc(:,:,js1,:) -
710  & gc_save(:,:,js1,:))/hj
711  CALL setblockrowcol(js1,icol,dataitem,upper)
712  END IF
713  END DO skip3_mesh
714  END IF lactive1
715 
716  END DO mesh_3pt
717  END DO n2d
718  END DO m2d
719  END DO ntype2d
720 
721  l_edge = .false.
722 
723  DEALLOCATE(dataitem, diag_val)
724  CALL second0(toff)
725  fill_blocks_time=fill_blocks_time + (toff - ton)
726 
727  END SUBROUTINE sweep3_blocks_par
728 
729  SUBROUTINE compute_blocks(xc, xcdot, gc)
730  USE vmec_main, ONLY: iter2
731 C-----------------------------------------------
732 C D u m m y A r g u m e n t s
733 C-----------------------------------------------
734  REAL(dp),DIMENSION(ns,0:ntor,0:mpol1,3*ntmax) :: xc, gc, xcdot
735 C-----------------------------------------------
736 C L o c a l V a r i a b l e s
737 C-----------------------------------------------
738  REAL(dp), PARAMETER :: p5 = 0.5_dp
739  INTEGER :: m, n, i, ntype, istat,
740  1 mblk, ibsize, iunit
741  REAL(dp) :: time_on, time_off, bsize, tprec2don, tprec2doff
742  CHARACTER(LEN=100):: label
743  LOGICAL, PARAMETER :: lscreen = .false.
744 C-----------------------------------------------
745 !
746 ! COMPUTES THE JACOBIAN BLOCKS block_mins, block_diag, block_plus
747 ! USING EITHER A SLOW - BUT RELIABLE - "JOG" TECHNIQUE, OR
748 ! USING PARTIAL ANALYTIC FORMULAE.
749 !
750 ! THE SUBROUTINE lam_blks IS CALLED FROM BCOVAR TO COMPUTE
751 ! THE ANALYTIC BLOCK ELEMENTS
752 !
753  CALL second0(tprec2don)
754  IF (l_backslv .and. sp.ne.dp) THEN
755  stop 'Should set sp = dp!'
756  END IF
757 
758  ntyptot = SIZE(gc,4)
759  mblk = ntyptot*mnsize
760 
761  bsize = real(mblk*mblk, dp)*3*ns*kind(block_diag)
762 ! IF (bsize .gt. HUGE(mblk)) THEN
763 ! WRITE (6, *) ' bsize: ', INT(bsize,SELECTED_INT_KIND(12)),
764 ! 1 ' exceeds HUGE(int): ', HUGE(mblk)
765 ! WRITE (6, *) ' Blocks will be written to disk.'
766 ! lswap2disk = .TRUE.
767 ! ELSE
768  lswap2disk = .false.
769 ! END IF
770 
771  IF (bsize .lt. 1.e6_dp) THEN
772  ibsize = bsize/1.e1_dp
773  label = " Kb"
774  ELSE IF (bsize .lt. 1.e9_dp) THEN
775  ibsize = bsize/1.e4_dp
776  label = " Mb"
777  ELSE
778  ibsize = bsize/1.e7_dp
779  label = " Gb"
780  END IF
781 
782  WRITE (6, 1000) iter2, 3*ntyptot*mnsize, mblk, real(ibsize)/100,
783  & trim(label)
784  WRITE (nthreed, 1000) iter2, 3*ntyptot*mnsize, mblk,
785  & real(ibsize)/100, trim(label)
786 !
787 ! COMPUTE AND STORE BLOCKS (MN X MN) FOR PRECONDITIONER
788 !
789  CALL second0(time_on)
790 
791  ALLOCATE (gc_save(ns,0:ntor,0:mpol1,ntyptot), stat=istat)
792  IF (istat .ne. 0) THEN
793  stop 'Allocation error: gc_save in compute_blocks'
794  END IF
795 
796  IF (ALLOCATED(block_diag)) THEN
797  DEALLOCATE (block_diag, block_plus, block_mins, stat=istat)
798  IF (istat .ne. 0) THEN
799  stop 'Deallocation error in compute blocks'
800  END IF
801  ELSE IF (ALLOCATED(block_diag_sw)) THEN
802  DEALLOCATE (block_diag_sw, block_plus_sw, block_mins_sw,
803  & stat=istat)
804  IF (istat .ne. 0) THEN
805  stop 'Deallocation error in compute blocks'
806  END IF
807  END IF
808 
809  ALLOCATE (block_diag(0:ntor,0:mpol1,ntyptot,
810  & 0:ntor,0:mpol1,ntyptot,ns),
811  & block_plus(0:ntor,0:mpol1,ntyptot,
812  & 0:ntor,0:mpol1,ntyptot,ns),
813  & block_mins(0:ntor,0:mpol1,ntyptot,
814  & 0:ntor,0:mpol1,ntyptot,ns),
815  & stat=istat)
816 
817  lswap2disk = (istat .NE. 0)
818 
819 !FOR DEBUGGING, SET THIS TO TRUE
820 ! lswap2disk = .TRUE.
821 
822  IF (lswap2disk) THEN
823  WRITE (6,'(a,i4,a)') ' Allocation error(1) = ', istat,
824  & ': Not enough memory in compute_blocks'
825  WRITE (6,*) ' Writing blocks to disk file'
826  ALLOCATE (block_diag_sw(0:ntor,0:mpol1,ntyptot,
827  & 0:ntor,0:mpol1,ntyptot),
828  & block_plus_sw(0:ntor,0:mpol1,ntyptot,
829  & 0:ntor,0:mpol1,ntyptot),
830  & block_mins_sw(0:ntor,0:mpol1,ntyptot,
831  & 0:ntor,0:mpol1,ntyptot),
832  & stat=istat)
833 
834  IF (istat .ne. 0) THEN
835  WRITE (6,'(a,i4)') ' Allocation error(2) = ', istat
836  stop
837  END IF
838 
839 ! Open DIRECT ACCESS file for writing blocks to disk
840 ! FIRST, we need to compute one row (in m,n,ntype-space) at a
841 ! time (not the full block). So we use a record size = mblk
842 ! with a block size = mblk**2. We will then close this and re-open
843 ! it with a record size = mblk**2 do deal with full blocks
844  scratchfile = "PRCND2A.bin"
845  IF (flashdrive .ne. "") THEN
846  scratchfile = flashdrive // scratchfile
847  END IF
848  CALL opendafile(mblk, mblk**2, 3, scratchfile, iunit_dacess, 0)
849  scratchfile = "PRCND2B.bin"
850  IF (flashdrive .ne. "") THEN
851  scratchfile = flashdrive // scratchfile
852  END IF
853  block_plus_sw = 0
854  block_mins_sw = 0
855  block_diag_sw = 0
856  ELSE
857  block_plus = 0
858  block_mins = 0
859  block_diag = 0
860  END IF
861 
862 !
863 ! GENERAL (SLOWER BY 2/3 THAN SYMMETRIC VERSION) METHOD: ASSUMES NO SYMMETRIES OF R, Z COEFFICIENTS
864 !
865  CALL sweep3_blocks (xc, xcdot, gc)
866  CALL compute_col_scaling
867  ictrl_prec2d = 1 !Signals funct3d (residue) to use block preconditioner
868 
869 !SPH021014: compute eigenvalues (for small enough matrices)
870 ! CALL get_eigenvalues(mblk, ns, block_mins, block_diag, block_plus)
871 
872  CALL second0(time_off)
873  WRITE (6,1001) time_off - time_on
874  WRITE (6,1001) nthreed
875 
876 ! SAVE ORIGINAL (UNFACTORED) BLOCKS FOR CHECKING INVERSE
877 ! IN L_BACKSLV=TRUE LOOP IN BLOCK_PRECOND
878  IF (l_backslv) THEN
879  ALLOCATE (block_dsave(0:ntor,0:mpol1,ntyptot,
880  & 0:ntor,0:mpol1,ntyptot,ns),
881  & block_msave(0:ntor,0:mpol1,ntyptot,
882  & 0:ntor,0:mpol1,ntyptot,ns),
883  & block_psave(0:ntor,0:mpol1,ntyptot,
884  & 0:ntor,0:mpol1,ntyptot,ns),
885  & stat = istat)
886  IF (istat .ne. 0) THEN
887  WRITE (6,*) 'Allocation error in l_backslv block: stat = ',
888  & istat
889  l_backslv = .false.
890  ELSE
891  block_dsave = block_diag; block_msave = block_mins
892  block_psave = block_plus
893  END IF
894  END IF
895 
896 !
897 ! FACTORIZE HESSIAN
898 !
899  CALL second0(time_on)
900  IF (ALLOCATED(ipiv_blk)) THEN
901  DEALLOCATE(ipiv_blk, stat=ntype)
902  END IF
903  ALLOCATE (ipiv_blk(mblk,ns), stat=ntype)
904  IF (ntype .ne. 0) stop 'Allocation error2 in block_precond'
905 
906  IF (lswap2disk) THEN
907  CALL blk3d_factor_swp(ipiv_blk, mblk, ns)
908  ELSE
909  CALL blk3d_factor(block_diag, block_mins, block_plus,
910  1 ipiv_blk, mblk, ns)
911  END IF
912 
913  CALL second0(time_off)
914  WRITE (6,1002) time_off - time_on
915  WRITE (nthreed,1002) time_off - time_on
916 
917  IF (.NOT.l_backslv) DEALLOCATE (gc_save)
918 
919  CALL second0(tprec2doff)
920 
921  timer(tprec2d) = timer(tprec2d) + (tprec2doff - tprec2don)
922 
923 1000 FORMAT(/,2x,'Initializing 2D block preconditioner at ',i5,
924  & ' iterations',
925  & /,2x,'Estimated time to compute Hessian = ',i5,
926  & ' VMEC time steps',
927  & /,2x,'Block dim: ',i4,'^2 Preconditioner size: ',f12.2,a)
928 1001 FORMAT(1x,' Time to compute blocks: ',f10.2,' s')
929 1002 FORMAT(1x,' Time to factor blocks: ',f10.2,' s')
930 
931  END SUBROUTINE compute_blocks
932 
933  SUBROUTINE sweep3_blocks(xc, xcdot, gc )
934  USE vmec_main, ONLY: ncurr, r01, z01, lthreed, chips, delt0r
935 !-----------------------------------------------
936 ! D u m m y A r g u m e n t s
937 !-----------------------------------------------
938  REAL(dp), DIMENSION(ns,0:ntor,0:mpol1,ntyptot) :: xc, xcdot,
939  & gc
940 !-----------------------------------------------
941 ! L o c a l V a r i a b l e s
942 !-----------------------------------------------
943  REAL(dp) :: eps, hj, diag_val(ns)
944  INTEGER :: js, istat, mesh, lamtype, rztype
945 
946 ! INTEGER :: m1, n1, nt1
947 
948 !-----------------------------------------------
949 !
950 ! COMPUTE EVERY 3rd RADIAL POINT FOR EACH MESH STARTING AT js=1,2,3, RESPECTIVELY
951 !
952  WRITE (6, *)
953  & " Using non-symmetric sweep to compute Hessian elements"
954 
955  eps = sqrt(epsilon(eps))
956  eps = eps/10
957  rztype = 2*ntmax
958  lamtype = rztype+1
959 
960  xcdot = 0
961  n_2d = 0; m_2d = 0
962 
963  ALLOCATE(dataitem(0:ntor,0:mpol1,3*ntmax), stat=istat)
964  IF (istat .NE. 0) THEN
965  stop 'Allocation error in sweep3_blocks'
966  END IF
967 
968  diag_val = 1
969 
970 !
971 ! CALL FUNCT3D FIRST TIME TO STORE INITIAL UN-PRECONDITIONED FORCES
972 !
973  ictrl_prec2d = 2 !Signals funct3d to initialize preconditioner (save state)
974  CALL funct3d (lscreen, istat)
975  IF (istat .ne. 0) THEN
976  print *,' ier_flag = ', istat,
977  & ' in SWEEP3_BLOCKS call to funct3d'
978  stop
979  ENDIF
980 
981 #if defined(CHI_FORCE)
982 ! STORE chips in xc
983  IF (ncurr .EQ. 1) THEN
984  xc(1:ns,0,0,lamtype) = chips(1:ns)
985  END IF
986 #endif
987  CALL restart_iter(delt0r)
988  ictrl_prec2d = 3 !Signals funct3d to compute preconditioner elements
989  gc_save = gc
990 
991 !
992 ! PERFORM R(m=0,n=0) JOG TO LOAD DIAG_VAL ARRAY
993 ! (DO NOT RELY ON IT BEING THE FIRST JOG IN LOOP)
994 !
995  m_2d = 0
996  n_2d = 0
997 #ifdef _HBANGLE
998  ntype_2d = zsc + ntmax
999 #else
1000  ntype_2d = rcc
1001 #endif
1002  edge_mesh = .false.
1003  DO mesh = 1, 3
1004  DO js = jstart(mesh), ns, 3
1005  IF (js .EQ. ns) THEN
1006  edge_mesh(mesh) = .true.
1007  END IF
1008  END DO
1009  END DO
1010 
1011 ! APPLY JOG
1012  hj = eps * max(abs(r01(ns)), abs(z01(ns)))
1013  DO js = jstart(1), ns, 3
1014  xcdot(js,n_2d,m_2d,ntype_2d) = hj
1015  END DO
1016 
1017  CALL funct3d (lscreen, istat)
1018  IF (istat .NE. 0) THEN
1019  stop 'Error computing Hessian jog!'
1020  END IF
1021 ! CLEAR JOG AND STORE BLOCKS FOR THIS JOG
1022  xcdot = 0
1023  DO js = jstart(1), ns, 3
1024  dataitem = (gc(js,:,:,:) - gc_save(js,:,:,:))/hj
1025  diag_val(js) = dataitem(0,0,ntype_2d)
1026  END DO
1027  IF (diag_val(1) .EQ. zero) THEN
1028  diag_val(1) = diag_val(4)
1029  END IF
1030  IF (diag_val(ns).EQ. zero) THEN
1031  diag_val(ns) = diag_val(ns-3)
1032  END IF
1033 
1034  DO js = jstart(2), ns, 3
1035  diag_val(js) = diag_val(js-1)
1036  END DO
1037  DO js = jstart(3), ns, 3
1038  diag_val(js) = diag_val(js-1)
1039  END DO
1040 
1041  IF (any(diag_val .EQ. zero)) THEN
1042  stop 'diag_val == 0'
1043  END IF
1044 !
1045 ! PERFORM "JOGS" FOR EACH VARIABLE AT EVERY 3rd RADIAL POINT ACROSS MESH
1046 ! FOR ntype_2d = (Rcc, Rss, Rsc, Rcs, Zsc, Zcs, Zcc, Zss)
1047 ! AND EVERY n_2d (toroidal mode index) and EVERY m_2d (poloidal mode index)
1048 !
1049  ntype2d: DO ntype_2d = 1, ntyptot
1050  IF (ntype_2d .LT. lamtype) THEN
1051  hj = eps*max(abs(r01(ns)), abs(z01(ns)))
1052  ELSE
1053  hj = eps
1054  END IF
1055 
1056  m2d: DO m_2d = 0, mpol1
1057  n2d: DO n_2d = 0, ntor
1058 #ifdef _HBANGLE
1059  IF (ntype_2d.GT.ntmax .AND. ntype_2d.LE.2*ntmax) THEN
1060  IF (m_2d .NE. 0) THEN
1061  block_diag(n_2d,m_2d,ntype_2d,
1062  & n_2d,m_2d,ntype_2d,:) = diag_val
1063  cycle
1064  END IF
1065  END IF
1066 #endif
1067  mesh_3pt: DO mesh = 1,3
1068 ! APPLY JOG
1069  DO js = jstart(mesh), ns, 3
1070  xcdot(js,n_2d,m_2d,ntype_2d) = hj
1071  END DO
1072 
1073  l_edge = edge_mesh(mesh)
1074  CALL funct3d(lscreen, istat)
1075  IF (istat .NE. 0) THEN
1076  stop 'Error computing Hessian jog!'
1077  END IF
1078 
1079 ! IF (.NOT.lfreeb) !NOT NEEDED, gcr, gcz -> 0 in RESIDUE for lfreeb=F
1080 ! 1 gc(ns,:,:,1:rztype) = gc_save(ns,:,:,1:rztype)
1081 !
1082 ! COMPUTE PRECONDITIONER (HESSIAN) ELEMENTS. LINEARIZED EQUATIONS
1083 ! OF FORM (FIXED mn FOR SIMPLICITY):
1084 !
1085 ! F(j-1) = a(j-1)x(j-2) + d(j-1)x(j-1) + b(j-1)x(j)
1086 ! F(j) = a(j)x(j-1) + d(j) x(j) + b(j) x(j+1)
1087 ! F(j+1) = a(j+1)x(j) + d(j+1)x(j+1) + b(j+1)x(j+2)
1088 !
1089 ! HESSIAN IS H(k,j) == dF(k)/dx(j); aj == block_mins; dj == block_diag; bj = block_plus
1090 !
1091 ! THUS, A PERTURBATION (in xc) AT POSITION js PRODUCES THE FOLLOWING RESULTS:
1092 !
1093 ! d(js) = dF(js )/hj(js)
1094 ! b(js-1) = dF(js-1)/hj(js)
1095 ! a(js+1) = dF(js+1)/hj(js)
1096 !
1097 ! CLEAR JOG
1098  xcdot = 0
1099 !
1100 ! STORE BLOCK ELEMENTS FOR THIS JOG.
1101 ! FOR OFF-DIAGONAL ELEMENTS, NEED TO ADJUST js INDICES +/- 1
1102 !
1103  skip3_mesh: DO js = jstart(mesh), ns, 3
1104 
1105  !block_mins(js+1) == a
1106  IF (js .lt. ns) THEN
1107  dataitem = (gc(js+1,:,:,:) -
1108  & gc_save(js+1,:,:,:))/hj
1109  END IF
1110 
1111  IF (lswap2disk) THEN
1112  CALL writedaitem_seq(dataitem)
1113  ELSE IF (js .lt. ns) THEN
1114  block_mins(:,:,:,n_2d,m_2d,ntype_2d,js+1) =
1115  & dataitem
1116  END IF
1117 
1118  !block_diag(js) == d
1119  dataitem = (gc(js,:,:,:) - gc_save(js,:,:,:))/hj
1120 
1121  IF (dataitem(n_2d,m_2d,ntype_2d) .EQ. zero) THEN
1122  dataitem(n_2d,m_2d,ntype_2d) = diag_val(js)
1123  END IF
1124 
1125 !Levenberg-like offset - do NOT apply here if applied in colscaling routine
1126  IF (ntype_2d .GE. lamtype) THEN
1127  dataitem(n_2d,m_2d,ntype_2d) =
1128  & 1.0001_dp*dataitem(n_2d,m_2d,ntype_2d)
1129  END IF
1130 
1131  IF (lswap2disk) THEN
1132  CALL writedaitem_seq(dataitem)
1133  ELSE
1134  block_diag(:,:,:,n_2d,m_2d,ntype_2d,js) =
1135  & dataitem
1136  END IF
1137 
1138  !block_plus(js-1) == b
1139  IF (js .GT. 1) THEN
1140  dataitem = (gc(js-1,:,:,:) -
1141  & gc_save(js-1,:,:,:))/hj
1142  END IF
1143 !no coupling of ALL fixed bdy forces to ANY r,z bdy values
1144  IF (lswap2disk) THEN
1145  CALL writedaitem_seq(dataitem)
1146  ELSE IF (js .GT. 1) THEN
1147  block_plus(:,:,:,n_2d,m_2d,ntype_2d,js-1) =
1148  & dataitem
1149  END IF
1150 
1151  END DO skip3_mesh
1152  END DO mesh_3pt
1153  END DO n2d
1154  END DO m2d
1155  END DO ntype2d
1156 
1157  l_edge = .false.
1158  DEALLOCATE(dataitem)
1159 
1160  END SUBROUTINE sweep3_blocks
1161 
1162  SUBROUTINE blk3d_factor(a, bm1, bp1, ipiv, mblk, nblocks)
1163 C-----------------------------------------------
1164 C M o d u l e s
1165 C-----------------------------------------------
1166 C-----------------------------------------------
1167 C L o c a l P a r a m e t e r s
1168 C-----------------------------------------------
1169  REAL(sp), PARAMETER :: zero = 0, one = 1
1170 C-----------------------------------------------
1171 C D u m m y A r g u m e n t s
1172 C-----------------------------------------------
1173  INTEGER, INTENT(in) :: nblocks, mblk
1174  INTEGER, TARGET, INTENT(out) :: ipiv(mblk,nblocks)
1175  REAL(sp), TARGET, DIMENSION(mblk,mblk,nblocks), INTENT(inout) ::
1176  & a, bm1, bp1
1177 C-----------------------------------------------
1178 C L o c a l V a r i a b l e s
1179 C-----------------------------------------------
1180 ! INTEGER :: ibuph, incnow, irecl, incbu, iunit=102, ndisk
1181  INTEGER :: k, k1, ier
1182  INTEGER, POINTER :: ipivot(:)
1183  REAL(sp), POINTER :: amat(:,:), bmat(:,:), cmat(:,:)
1184  REAL(sp), ALLOCATABLE, DIMENSION(:,:) :: temp
1185 C-----------------------------------------------
1186 c modified (June, 2003, ORNL): S. P. Hirshman
1187 c-----------------------------------------------------------------------
1188 c
1189 c this subroutine solves for the Q factors of a block-tridiagonal system of equations.
1190 c
1191 c-----------------------------------------------------------------------
1192 c INPUT
1193 c mblk : block dimension (elements in a block=mblk X mblk)
1194 c nblocks : number of blocks
1195 c a : diagonal blocks
1196 c bp1, bm1 : lower, upper blocks (see equation below)
1197 c
1198 c OUTPUT
1199 c ipiv : pivot elements for kth block
1200 c a : a-1 LU factor blocks
1201 c bm1 : q = a-1 * bm1 matrix
1202 c
1203 c LOCAL VARIABLES
1204 c iunit : unit number for block-tridiagonal solution disk file.
1205 c
1206 c solutions are indexed in m-n fourier-space, legendre-space. the tri-diagonal
1207 c equation is:
1208 c
1209 c bm1 * f(l-1) + a * f(l) + bp1 * f(l+1) = source(l)
1210 c
1211 c GENERAL SOLUTION SCHEME APPLIED TO EACH BLOCK ROW (INDEX L)
1212 c
1213 c 1. Start from row N and solve for x(N) in terms of x(N-1):
1214 c
1215 c x(N) = -q(N)*x(N-1) + r(N)
1216 c
1217 c q(N) = a(N)[-1] * bm1; r(N) = a(N)[-1] * s(N)
1218 c
1219 c where a(N)[-1] is the inverse of a(N)
1220 c
1221 c 2. Substitute for lth row to get recursion equation fo q(l) and r(l):
1222 c
1223 c x(l) = -q(l)*x(l-1) + r(l), in general, where:
1224 c
1225 c q(l) = (a(l) - bp1(l)*q(l+1))[-1] * bm1(l)
1226 c
1227 c qblk(l) == (a(l) - bp1(l) * q(l+1))[-1] on return
1228 c
1229 c r(l) = (a(l) - bp1(l)*q(l+1))[-1] * (s(l) - bp1(l)*r(l+1))
1230 c
1231 c 3. At row l = 1, bm1(1) = 0 and get an equation for x(1) corresponding to q(1) = 0:
1232 c
1233 c x(1) = r(1)
1234 c
1235 c 4. Finally, can back-solve for x(N) in terms of x(N-1) from eqn.(1) above
1236 c
1237 c
1238 c NUMERICAL IMPLEMENTATION (USING LAPACK ROUTINES)
1239 c
1240 c 1. CALL dgetrf: Perform LU factorization of diagonal block (A) - faster than sgefa
1241 c 2. CALL dgetrs: With multiple (mblk) right-hand sides, to do block inversion
1242 c operation, A X = B (stores result in B; here B is a matrix)
1243 c
1244 
1245 c main loop. load and process (backwards) block-rows nblocks to 1.
1246 
1247 
1248  blocks: DO k = nblocks, 1, -1
1249 !
1250 ! Compute (and save) qblk(k) = ablk(k)[-1] * bml
1251 !
1252  amat => a(:,:,k); ipivot => ipiv(:,k)
1253  CALL dgetrf(mblk, mblk, amat, mblk, ipivot, ier)
1254  IF (ier .ne. 0) GOTO 200
1255  IF (k .eq. 1) EXIT
1256 
1257  bmat => bm1(:,:,k)
1258  CALL dgetrs('n', mblk, mblk, amat, mblk, ipivot, bmat, mblk,
1259  & ier)
1260 
1261  IF (ier .ne. 0) GOTO 305
1262 
1263 !
1264 ! Update effective diagonal "a" matrix. Use dgemm: faster AND doesn't overflow normal stack
1265 !
1266  k1 = k-1
1267  amat => bp1(:,:,k1)
1268  cmat => a(:,:,k1)
1269 ! cmat = cmat - MATMUL(amat, bmat)
1270  CALL dgemm('N','N',mblk,mblk,mblk,-one,amat,mblk,
1271  1 bmat, mblk, one, cmat, mblk)
1272 
1273  END DO blocks
1274 
1275 !
1276 ! COMPUTE TRANSPOSES HERE, SINCE REPEATEDLY CALLING MATMUL OPERATION
1277 ! X*At IS FASTER THAN A*X DUE TO UNIT STRIDE
1278 !
1279  ALLOCATE (temp(mblk,mblk), stat=k)
1280  IF (k .ne. 0) stop 'Allocation error in blk3d_factor!'
1281 
1282  DO k = 1, nblocks
1283  IF (k .ne. nblocks) THEN
1284  temp = transpose(bp1(:,:,k))
1285  bp1(:,:,k) = temp
1286  END IF
1287  IF (k .ne. 1) THEN
1288  temp = transpose(bm1(:,:,k))
1289  bm1(:,:,k) = temp
1290  END IF
1291  END DO
1292 
1293  DEALLOCATE (temp)
1294 
1295  GOTO 400
1296 
1297 c error returns. ------------------------------------------------------
1298 
1299  200 CONTINUE
1300 ! < 0: if info = -i, the i-th argument had an illegal value
1301 ! > 0: if info = i, u(i,i) is exactly zero. the factorization
1302  WRITE (6,1000) k
1303  IF (ier < 0) THEN
1304  WRITE (6,1001) ier
1305  END IF
1306  IF (ier > 0) THEN
1307  WRITE (6,1002) ier
1308  END IF
1309  stop
1310  305 CONTINUE
1311  WRITE (6, 1003) ier
1312  stop
1313 
1314 
1315  400 CONTINUE
1316 
1317 1000 FORMAT(2x,'Error factoring matrix in blk3d: block = ',i4)
1318 1001 FORMAT(i4,'th argument has illegal value')
1319 1002 FORMAT(i4,'th diagonal factor exactly zero')
1320 1003 FORMAT(2/' BLK3D: error detected: ier =',i4,2/)
1321 
1322  END SUBROUTINE blk3d_factor
1323 
1324 
1325  SUBROUTINE blk3d_factor_swp(ipiv, mblk, nblocks)
1326 C-----------------------------------------------
1327 C M o d u l e s
1328 C-----------------------------------------------
1329 C-----------------------------------------------
1330 C L o c a l P a r a m e t e r s
1331 C-----------------------------------------------
1332  REAL(sp), PARAMETER :: zero = 0, one = 1
1333 C-----------------------------------------------
1334 C D u m m y A r g u m e n t s
1335 C-----------------------------------------------
1336  INTEGER, INTENT(in) :: nblocks, mblk
1337  INTEGER, TARGET, INTENT(out) :: ipiv(mblk,nblocks)
1338 C-----------------------------------------------
1339 C L o c a l V a r i a b l e s
1340 C-----------------------------------------------
1341  REAL(sp), ALLOCATABLE, DIMENSION(:,:) ::
1342  & amat, bmat, cmat, temp
1343  INTEGER :: k, k1, ier
1344  INTEGER, POINTER :: ipivot(:)
1345 C-----------------------------------------------
1346 c modified (June, 2003, ORNL): S. P. Hirshman
1347 c modified (June, 2007, ORNL), added lswap2disk logic
1348 c-----------------------------------------------------------------------
1349 c
1350 c this subroutine solves for the Q factors of a block-tridiagonal system of equations.
1351 c see blk3d_factor for more details
1352 c
1353 c-----------------------------------------------------------------------
1354 c INPUT
1355 c mblk : block dimension (elements in a block=mblk X mblk)
1356 c nblocks : number of blocks
1357 c
1358 c OUTPUT
1359 c ipiv : pivot elements for kth block
1360 c
1361 c LOCAL VARIABLES
1362 c iunit : unit number for block-tridiagonal solution disk file.
1363 c
1364 c solutions are indexed in m-n fourier-space, legendre-space. the tri-diagonal
1365 c equation is:
1366 c
1367 c bm1 * f(l-1) + a * f(l) + bp1 * f(l+1) = source(l)
1368 c
1369 c
1370 c NUMERICAL IMPLEMENTATION (USING LAPACK ROUTINES)
1371 c
1372 c 1. CALL dgetrf: Perform LU factorization of diagonal block (A) - faster than sgefa
1373 c 2. CALL dgetrs: With multiple (mblk) right-hand sides, to do block inversion
1374 c operation, A X = B (stores result in B; here B is a matrix)
1375 c
1376 
1377 c main loop. load and process (backwards) block-rows nblocks to 1.
1378 
1379 ! CHANGE Direct Access Record length to block size (from individual rows)
1380  CALL changedafileparams(mblk**2, mblk**2, 3, scratchfile, nblocks)
1381 
1382  ALLOCATE(amat(mblk,mblk), bmat(mblk,mblk), cmat(mblk,mblk),
1383  & temp(mblk,mblk), stat=ier)
1384  IF (ier .ne. 0) stop 'Allocation error in blk3d_factor_swp!'
1385 
1386  CALL readdaitem2(temp, nblocks, bldia)
1387 
1388  blocks: DO k = nblocks, 1, -1
1389 !
1390 ! Compute (and save) qblk(k) = ablk(k)[-1] * bml
1391 !
1392  amat = temp
1393  ipivot => ipiv(:,k)
1394  CALL dgetrf(mblk, mblk, amat, mblk, ipivot, ier)
1395  IF (ier .ne. 0) GOTO 200
1396 !CONFIRM READ-WRITE ALLOWED...OK FOR DA Files!
1397  CALL writedaitem_ra(amat, k, bldia, 1)
1398 
1399  IF (k .eq. 1) EXIT
1400 
1401  CALL readdaitem2(bmat, k, blmin)
1402  CALL dgetrs('n', mblk, mblk, amat, mblk, ipivot, bmat, mblk,
1403  & ier)
1404  IF (ier .ne. 0) GOTO 305
1405 !
1406 ! COMPUTE TRANSPOSES HERE (and for cmat below), SINCE REPEATEDLY CALLING MATMUL OPERATION
1407 ! X*At IS FASTER THAN A*X DUE TO UNIT STRIDE
1408 !
1409  temp = transpose(bmat)
1410  CALL writedaitem_ra(temp, k, blmin, 1)
1411 
1412 !
1413 ! Update effective diagonal "a" matrix. Use dgemm: faster AND doesn't overflow normal stack
1414 !
1415  k1 = k-1
1416  CALL readdaitem2(amat, k1, blpls)
1417  CALL readdaitem2(temp, k1, bldia)
1418 ! temp = temp - MATMUL(amat, bmat)
1419  CALL dgemm('N','N',mblk,mblk,mblk,-one,amat,mblk, bmat, mblk,
1420  & one, temp, mblk)
1421  cmat = transpose(amat)
1422  CALL writedaitem_ra(cmat, k1, blpls, 1)
1423 
1424  END DO blocks
1425 
1426  GOTO 400
1427 
1428 c error returns. ------------------------------------------------------
1429 
1430  200 CONTINUE
1431 ! < 0: if info = -i, the i-th argument had an illegal value
1432 ! > 0: if info = i, u(i,i) is exactly zero. the factorization
1433  WRITE (6,1000) k
1434  IF (ier < 0) THEN
1435  WRITE (6,1001) ier
1436  END IF
1437  IF (ier > 0) THEN
1438  WRITE (6,1002) ier
1439  END IF
1440  stop
1441  305 CONTINUE
1442  WRITE (6, 1003) ier
1443  stop
1444 
1445  400 CONTINUE
1446 
1447  DEALLOCATE(amat, bmat, cmat, temp, stat=ier)
1448 
1449  CALL closedafile
1450 
1451 1000 FORMAT(2x,'Error factoring matrix in blk3d: block = ',i4)
1452 1001 FORMAT(i4,'th argument has illegal value')
1453 1002 FORMAT(i4,'th diagonal factor exactly zero')
1454 1003 FORMAT(2/' BLK3D: error detected: ier =',i4,2/)
1455 
1456  END SUBROUTINE blk3d_factor_swp
1457 
1458  SUBROUTINE blk3d_factor_swp2(ipiv, mblk, nblocks)
1459 C-----------------------------------------------
1460 C M o d u l e s
1461 C-----------------------------------------------
1462 C-----------------------------------------------
1463 C L o c a l P a r a m e t e r s
1464 C-----------------------------------------------
1465  REAL(sp), PARAMETER :: zero = 0, one = 1
1466 C-----------------------------------------------
1467 C D u m m y A r g u m e n t s
1468 C-----------------------------------------------
1469  INTEGER, INTENT(in) :: nblocks, mblk
1470  INTEGER, TARGET, INTENT(out) :: ipiv(mblk,nblocks)
1471 C-----------------------------------------------
1472 C L o c a l V a r i a b l e s
1473 C-----------------------------------------------
1474  REAL(sp), ALLOCATABLE, DIMENSION(:,:) ::
1475  1 amat, bmat, cmat, temp
1476  INTEGER :: k, k1, ier
1477  INTEGER, POINTER :: ipivot(:)
1478 C-----------------------------------------------
1479 c modified (June, 2003, ORNL): S. P. Hirshman
1480 c modified (June, 2007, ORNL), added lswap2disk logic
1481 c modified (July, 2007, ORNL/VA State):Helen Yang - forward sweep for speed
1482 c-----------------------------------------------------------------------
1483 c
1484 c this subroutine solves for the Q factors of a block-tridiagonal system of equations.
1485 c see blk3d_factor for more details
1486 c
1487 c-----------------------------------------------------------------------
1488 c INPUT
1489 c mblk : block dimension (elements in a block=mblk X mblk)
1490 c nblocks : number of blocks
1491 c
1492 c OUTPUT
1493 c ipiv : pivot elements for kth block
1494 c
1495 c LOCAL VARIABLES
1496 c iunit : unit number for block-tridiagonal solution disk file.
1497 c
1498 c solutions are indexed in m-n fourier-space, legendre-space. the tri-diagonal
1499 c equation is:
1500 c
1501 c bm1 * f(l-1) + a * f(l) + bp1 * f(l+1) = source(l)
1502 c
1503 c
1504 c NUMERICAL IMPLEMENTATION (USING LAPACK ROUTINES)
1505 c
1506 c 1. CALL dgetrf: Perform LU factorization of diagonal block (A) - faster than sgefa
1507 c 2. CALL dgetrs: With multiple (mblk) right-hand sides, to do block inversion
1508 c operation, A X = B (stores result in B; here B is a matrix)
1509 c
1510 
1511 c main loop. load and process (backwards) block-rows nblocks to 1.
1512 
1513 ! CHANGE Direct Access Record length to block size (from individual rows)
1514  CALL changedafileparams(mblk**2, mblk**2, 3, scratchfile, nblocks)
1515 
1516  ALLOCATE(amat(mblk,mblk), bmat(mblk,mblk), cmat(mblk,mblk),
1517  & temp(mblk,mblk), stat=ier)
1518  IF (ier .ne. 0) stop 'Allocation error in blk3d_factor_swp!'
1519 
1520  CALL readdaitem2(temp, 1, bldia)
1521 
1522  blocks: DO k = 1,nblocks
1523 !
1524 ! Compute (and save) qblk(k) = ablk(k)[-1] * bml
1525 !
1526  amat = temp
1527  ipivot => ipiv(:,k)
1528  CALL dgetrf (mblk, mblk, amat, mblk, ipivot, ier)
1529  IF (ier .ne. 0) GOTO 200
1530 !CONFIRM READ-WRITE ALLOWED...OK FOR DA Files!
1531  CALL writedaitem_ra(amat, k, bldia, 1)
1532 
1533  IF (k .eq. nblocks) EXIT
1534 
1535  CALL readdaitem2(bmat, k, blpls)
1536  CALL dgetrs('n', mblk, mblk, amat, mblk, ipivot, bmat, mblk,
1537  & ier)
1538  IF (ier .ne. 0) GOTO 305
1539 !
1540 ! COMPUTE TRANSPOSES HERE (and for cmat below), SINCE REPEATEDLY CALLING MATMUL OPERATION
1541 ! X*At IS FASTER THAN A*X DUE TO UNIT STRIDE
1542 !
1543  temp = transpose(bmat)
1544  CALL writedaitem_ra(temp, k, blpls, 1)
1545 
1546 !
1547 ! Update effective diagonal "a" matrix. Use dgemm: faster AND doesn't overflow normal stack
1548 !
1549  k1 = k + 1
1550  CALL readdaitem2(amat, k1, blmin)
1551  CALL readdaitem2(temp, k1, bldia)
1552 ! temp = temp - MATMUL(amat, bmat)
1553  CALL dgemm('N','N',mblk,mblk,mblk,-one,amat,mblk, bmat, mblk,
1554  & one, temp, mblk)
1555  cmat = transpose(amat)
1556  CALL writedaitem_ra(cmat, k1, blmin, 1)
1557 
1558  END DO blocks
1559 
1560  GOTO 400
1561 
1562 c error returns. ------------------------------------------------------
1563 
1564  200 CONTINUE
1565 ! < 0: if info = -i, the i-th argument had an illegal value
1566 ! > 0: if info = i, u(i,i) is exactly zero. the factorization
1567  WRITE (6,1000) k
1568  IF (ier < 0) THEN
1569  WRITE (6,1001) ier
1570  END IF
1571  IF (ier > 0) THEN
1572  WRITE (6,1002) ier
1573  END IF
1574  stop
1575  305 CONTINUE
1576  WRITE (6,1003) ier
1577  stop
1578 
1579  400 CONTINUE
1580 
1581  DEALLOCATE(amat, bmat, cmat, temp, stat=ier)
1582 
1583  CALL closedafile
1584 
1585 1000 FORMAT(2x,'Error factoring matrix in blk3d: block = ',i4)
1586 1001 FORMAT(i4,'th argument has illegal value')
1587 1002 FORMAT(i4,'th diagonal factor exactly zero')
1588 1003 FORMAT(2/' BLK3D: error detected: ier =',i4,2/)
1589 
1590  END SUBROUTINE blk3d_factor_swp2
1591 
1592 
1593  SUBROUTINE blk3d_slv(ablk, qblk, bp1, source,
1594  1 ipiv, mblk, nblocks)
1595 C-----------------------------------------------
1596 C M o d u l e s
1597 C-----------------------------------------------
1598  USE stel_kinds
1599 ! USE safe_open_mod
1600 C-----------------------------------------------
1601 C D u m m y A r g u m e n t s
1602 C-----------------------------------------------
1603  INTEGER, INTENT(in) :: nblocks, mblk
1604  INTEGER, TARGET, INTENT(in) :: ipiv(mblk,nblocks)
1605  REAL(sp), TARGET, DIMENSION(mblk,mblk,nblocks), INTENT(in) ::
1606  1 ablk, qblk, bp1
1607  REAL(dp), DIMENSION(mblk,nblocks), INTENT(inout)
1608  1 :: source
1609 C-----------------------------------------------
1610 C L o c a l V a r i a b l e s
1611 C-----------------------------------------------
1612  INTEGER, POINTER :: ipivot(:)
1613  INTEGER :: k, ier
1614  REAL(sp), POINTER :: amat(:,:) !, x1(:), y1(:)
1615  REAL(sp) :: source_sp(mblk)
1616 C-----------------------------------------------
1617 c modified (June, 2003, ORNL): S. P. Hirshman
1618 c-----------------------------------------------------------------------
1619 c
1620 c this subroutine solves a block-tridiagonal system of equations, using
1621 c the ABLK, QBLK factors from blk3d_factor,
1622 c
1623 c-----------------------------------------------------------------------
1624 c INPUT
1625 c mblk : block size
1626 c nblocks : number of blocks
1627 c bp1 : upper blocks (see equation below)
1628 c ipiv : pivot elements for kth block
1629 c ablk : a-1 blocks
1630 c qblk : q = a-1 * bm1
1631 c source : input right side
1632 c
1633 c OUTPUT
1634 c source : Solution x of A x = source
1635 c
1636 c LOCAL VARIABLES
1637 c iunit : unit number for block-tridiagonal solution disk file.
1638 c
1639 c solutions are indexed in m-n fourier-space, legendre-space. the tri-diagonal
1640 c equation is:
1641 c
1642 c bm1 * f(l-1) + a * f(l) + bp1 * f(l+1) = source(l)
1643 c
1644 c GENERAL SOLUTION SCHEME APPLIED TO EACH BLOCK ROW (INDEX L)
1645 c
1646 c 1. Start from row N and solve for x(N) in terms of x(N-1):
1647 c
1648 c x(N) = -q(N)*x(N-1) + r(N)
1649 c
1650 c q(N) = a(N)[-1] * bm1; r(N) = a(N)[-1] * s(N)
1651 c
1652 c where a(N)[-1] is the inverse of a(N)
1653 c
1654 c 2. Substitute for lth row to get recursion equation fo q(l) and r(l):
1655 c
1656 c x(l) = -q(l)*x(l-1) + r(l), in general, where:
1657 c
1658 c q(l) = (a(l) - bp1(l)*q(l+1))[-1] * bm1(l)
1659 c
1660 c qblk(l) == (a(l) - bp1(l) * q(l+1))[-1] on return
1661 c
1662 c r(l) = (a(l) - bp1(l)*q(l+1))[-1] * (s(l) - bp1(l)*r(l+1))
1663 c
1664 c 3. At row l = 1, bm1(1) = 0 and get an equation for x(1) corresponding to q(1) = 0:
1665 c
1666 c x(1) = r(1)
1667 c
1668 c 4. Finally, can back-solve for x(N) in terms of x(N-1) from eqn.(1) above
1669 c
1670 c
1671 c NUMERICAL IMPLEMENTATION (USING LAPACK ROUTINES)
1672 c
1673 c 1. CALL dgetrs: With single right hand side (source) to solve A x = b (b a vector)
1674 c Faster than dgesl
1675 ! ndisk = mblk*mblk
1676 
1677 c main loop. load and process (backwards) block-rows nblocks to 1.
1678 ! note: about equal time is spent in calling dgetrs and in performing
1679 ! the two loop sums: on ibm-pc, 2 s (trs) vs 3 s (sums); on linux (logjam),
1680 ! 2.4 s (trs) vs 3 s (sums).
1681 
1682 !
1683 ! Back-solve for modified sources first
1684 !
1685  blocks: DO k = nblocks, 1, -1
1686 
1687  source_sp = source(:,k)
1688  ipivot => ipiv(:,k); amat => ablk(:,:,k)
1689  CALL dgetrs('n', mblk, 1, amat, mblk,
1690  1 ipivot, source_sp, mblk, ier)
1691  source(:,k) = source_sp
1692 
1693  IF (ier .ne. 0) GOTO 305
1694  IF (k .eq. 1) EXIT
1695 
1696 !
1697 ! NOTE: IN BLK3D_FACTOR, BP1 AND BM1 WERE TRANSPOSED (AND STORED)
1698 ! TO MAKE FIRST INDEX FASTEST VARYING IN THE FOLLOWING MATMUL OPS
1699 !
1700  amat => bp1(:,:,k-1)
1701  source(:,k-1) = source(:,k-1) - matmul(source(:,k),amat) !USE THIS FORM IF TRANSPOSED bp1
1702 ! source(:,k-1) = source(:,k-1) - MATMUL(amat,source(:,k)) !UNTRANSPOSED FORM
1703 ! x1 => source(:,k); y1 => source(:,k-1)
1704 ! CALL dgemv('T',mblk,mblk,-one,amat,mblk,x1,1,
1705 ! 1 one,y1,1)
1706 
1707  END DO blocks
1708 !
1709 ! forward (back-substitution) solution sweep for block-rows k = 2 to nblocks
1710 ! now, source contains the solution vector
1711 !
1712  DO k = 2, nblocks
1713  amat => qblk(:,:,k)
1714  source(:,k) = source(:,k) - matmul(source(:,k-1),amat) !USE THIS FORM IF TRANSPOSED qblk
1715 ! source(:,k) = source(:,k) - MATMUL(amat,source(:,k-1)) !UNTRANSPOSED FORM
1716 ! x1 => source(:,k-1); y1 => source(:,k)
1717 ! CALL dgemv('T',mblk,mblk,-one,amat,mblk,x1,1,
1718 ! 1 one,y1,1)
1719 
1720  END DO
1721 
1722  GOTO 400
1723 
1724 c error returns. ------------------------------------------------------
1725 
1726  305 CONTINUE
1727  WRITE (6, '(2/a,i4,2/)') ' BLK3D: error detected: ier =',
1728  1 ier
1729  stop
1730 
1731  400 CONTINUE
1732 
1733  END SUBROUTINE blk3d_slv
1734 
1735 
1736  SUBROUTINE blk3d_slv_swp(source, ipiv, mblk, nblocks)
1737 C-----------------------------------------------
1738 C M o d u l e s
1739 C-----------------------------------------------
1740  USE stel_kinds
1741 ! USE safe_open_mod
1742 C-----------------------------------------------
1743 C D u m m y A r g u m e n t s
1744 C-----------------------------------------------
1745  INTEGER, INTENT(in) :: nblocks, mblk
1746  INTEGER, TARGET, INTENT(in) :: ipiv(mblk,nblocks)
1747  REAL(dp), DIMENSION(mblk,nblocks), INTENT(inout)
1748  1 :: source
1749 C-----------------------------------------------
1750 C L o c a l V a r i a b l e s
1751 C-----------------------------------------------
1752  REAL(sp), ALLOCATABLE, DIMENSION(:,:) :: amat
1753  INTEGER, POINTER :: ipivot(:)
1754  INTEGER :: k, k1, ier
1755  REAL(sp) :: source_sp(mblk)
1756 C-----------------------------------------------
1757 c modified (June, 2003, ORNL): S. P. Hirshman
1758 c modified (June, 2007, ORNL), added lswap2disk logic
1759 c-----------------------------------------------------------------------
1760 c
1761 c this subroutine solves a block-tridiagonal system of equations, using
1762 c the ABLK, QBLK factors from blk3d_factor,
1763 c See blk3d_slv for details
1764 c
1765 c-----------------------------------------------------------------------
1766 c INPUT
1767 c mblk : block size
1768 c nblocks : number of blocks
1769 c ipiv : pivot elements for kth block
1770 c source : input right side
1771 c
1772 c OUTPUT
1773 c source : Solution x of A x = source
1774 c
1775 c LOCAL VARIABLES
1776 c iunit : unit number for block-tridiagonal solution disk file.
1777 c
1778 c the tri-diagonal equation is:
1779 c
1780 c bm1 * f(l-1) + a * f(l) + bp1 * f(l+1) = source(l)
1781 c
1782 
1783 c main loop. load and process (backwards) block-rows nblocks to 1.
1784 ! note: about equal time is spent in calling dgetrs and in performing
1785 ! the two loop sums: on ibm-pc, 2 s (trs) vs 3 s (sums); on linux (logjam),
1786 ! 2.4 s (trs) vs 3 s (sums).
1787 
1788  CALL opendafile(mblk**2, mblk**2, 3, scratchfile, iunit_dacess, 1)
1789 
1790  ALLOCATE (amat(mblk,mblk), stat=ier)
1791  IF (ier .ne. 0) stop 'Allocation error in blk3d_slv_swp!'
1792 !
1793 ! Back-solve for modified sources first
1794 !
1795  blocks: DO k = nblocks, 1, -1
1796 
1797  source_sp = source(:,k)
1798  ipivot => ipiv(:,k)
1799  CALL readdaitem2(amat, k, bldia)
1800  CALL dgetrs('n', mblk, 1, amat, mblk,
1801  1 ipivot, source_sp, mblk, ier)
1802  source(:,k) = source_sp
1803 
1804  IF (ier .ne. 0) GOTO 305
1805  IF (k .eq. 1) EXIT
1806 
1807 !
1808 ! NOTE: IN BLK3D_FACTOR, BP1 AND BM1 WERE TRANSPOSED (AND STORED)
1809 ! TO MAKE FIRST INDEX FASTEST VARYING IN THE FOLLOWING MATMUL OPS
1810 !
1811  k1 = k-1
1812  CALL readdaitem2(amat, k1, blpls)
1813  source(:,k1) = source(:,k1) - matmul(source(:,k),amat) !USE THIS FORM IF TRANSPOSED bp1
1814 ! source(:,k1) = source(:,k1) - MATMUL(amat,source(:,k)) !UNTRANSPOSED FORM
1815 ! x1 => source(:,k); y1 => source(:,k-1)
1816 ! CALL dgemv('T',mblk,mblk,-one,amat,mblk,x1,1,
1817 ! 1 one,y1,1)
1818 
1819  END DO blocks
1820 !
1821 ! forward (back-substitution) solution sweep for block-rows k = 2 to nblocks
1822 ! now, source contains the solution vector
1823 !
1824  DO k = 2, nblocks
1825 
1826  CALL readdaitem2(amat, k, blmin)
1827  source(:,k) = source(:,k) - matmul(source(:,k-1),amat) !USE THIS FORM IF TRANSPOSED qblk
1828 ! source(:,k) = source(:,k) - MATMUL(amat,source(:,k-1)) !UNTRANSPOSED FORM
1829 ! x1 => source(:,k-1); y1 => source(:,k)
1830 ! CALL dgemv('T',mblk,mblk,-one,amat,mblk,x1,1,
1831 ! 1 one,y1,1)
1832 
1833  END DO
1834 
1835  WRITE(100,'(1p,6e14.4)') source(:,ns/2)
1836 
1837  GOTO 400
1838 
1839 c error returns. ------------------------------------------------------
1840 
1841  305 CONTINUE
1842  WRITE (6, '(2/a,i4,2/)') ' BLK3D: error detected: ier =',
1843  1 ier
1844  stop
1845 
1846  400 CONTINUE
1847 
1848  CALL closedafile
1849 
1850  END SUBROUTINE blk3d_slv_swp
1851 
1852 
1853  SUBROUTINE blk3d_slv_swp2(source, ipiv, mblk, nblocks)
1854 C-----------------------------------------------
1855 C M o d u l e s
1856 C-----------------------------------------------
1857  USE stel_kinds
1858 ! USE safe_open_mod
1859 C-----------------------------------------------
1860 C D u m m y A r g u m e n t s
1861 C-----------------------------------------------
1862  INTEGER, INTENT(in) :: nblocks, mblk
1863  INTEGER, TARGET, INTENT(in) :: ipiv(mblk,nblocks)
1864  REAL(dp), DIMENSION(mblk,nblocks), INTENT(inout)
1865  1 :: source
1866 C-----------------------------------------------
1867 C L o c a l V a r i a b l e s
1868 C-----------------------------------------------
1869  REAL(sp), ALLOCATABLE, DIMENSION(:,:) :: amat
1870  INTEGER, POINTER :: ipivot(:)
1871  INTEGER :: k, k1, ier
1872  REAL(sp) :: source_sp(mblk)
1873 C-----------------------------------------------
1874 c modified (June, 2003, ORNL): S. P. Hirshman
1875 c modified (June, 2007, ORNL), added lswap2disk logic
1876 c modified (July, 2007, ORNL/VA State):Helen Yang - forward sweep for speed
1877 c-----------------------------------------------------------------------
1878 c
1879 c this subroutine solves a block-tridiagonal system of equations, using
1880 c the ABLK, QBLK factors from blk3d_factor,
1881 c See blk3d_slv for details
1882 c
1883 c-----------------------------------------------------------------------
1884 c INPUT
1885 c mblk : block size
1886 c nblocks : number of blocks
1887 c bp1 : upper blocks (see equation below)
1888 c ipiv : pivot elements for kth block
1889 c ablk : a-1 blocks
1890 c qblk : q = a-1 * bm1
1891 c source : input right side
1892 c
1893 c OUTPUT
1894 c source : Solution x of A x = source
1895 c
1896 c LOCAL VARIABLES
1897 c iunit : unit number for block-tridiagonal solution disk file.
1898 c
1899 c the tri-diagonal equation is:
1900 c
1901 c bm1 * f(l-1) + a * f(l) + bp1 * f(l+1) = source(l)
1902 c
1903 
1904 c main loop. load and process (backwards) block-rows nblocks to 1.
1905 ! note: about equal time is spent in calling dgetrs and in performing
1906 ! the two loop sums: on ibm-pc, 2 s (trs) vs 3 s (sums); on linux (logjam),
1907 ! 2.4 s (trs) vs 3 s (sums).
1908 
1909  CALL opendafile(mblk**2, mblk**2, 3, scratchfile, iunit_dacess, 1)
1910 
1911  ALLOCATE (amat(mblk,mblk), stat=ier)
1912  IF (ier .ne. 0) stop 'Allocation error in blk3d_slv_swp!'
1913 !
1914 ! Back-solve for modified sources first
1915 !
1916  blocks: DO k = 1, nblocks
1917 
1918  source_sp = source(:,k)
1919  ipivot => ipiv(:,k)
1920  CALL readdaitem2(amat, k, bldia)
1921  CALL dgetrs('n', mblk, 1, amat, mblk,
1922  1 ipivot, source_sp, mblk, ier)
1923  source(:,k) = source_sp
1924 
1925  IF (ier .ne. 0) GOTO 305
1926  IF (k .eq. nblocks) EXIT
1927 
1928 !
1929 ! NOTE: IN BLK3D_FACTOR, BP1 AND BM1 WERE TRANSPOSED (AND STORED)
1930 ! TO MAKE FIRST INDEX FASTEST VARYING IN THE FOLLOWING MATMUL OPS
1931 !
1932  k1 = k+1
1933  CALL readdaitem2(amat, k1, blmin)
1934  source(:,k1) = source(:,k1) - matmul(source(:,k),amat) !USE THIS FORM IF TRANSPOSED bp1
1935 
1936  END DO blocks
1937 !
1938 ! backward solution sweep for block-rows k = nblocks-1 to 1
1939 ! now, source contains the solution vector
1940 !
1941  DO k = nblocks-1, 1, -1
1942 
1943  CALL readdaitem2(amat, k, blpls)
1944  k1 = k+1
1945  source(:,k) = source(:,k) - matmul(source(:,k1),amat) !USE THIS FORM IF TRANSPOSED qblk
1946 
1947  END DO
1948 
1949  WRITE(100,'(1p,6e14.4)') source(:,ns/2)
1950 
1951  GOTO 400
1952 
1953 c error returns. ------------------------------------------------------
1954 
1955  305 CONTINUE
1956  WRITE (6, '(2/a,i4,2/)') ' BLK3D: error detected: ier =',
1957  1 ier
1958  stop
1959 
1960  400 CONTINUE
1961 
1962  CALL closedafile
1963 
1964  END SUBROUTINE blk3d_slv_swp2
1965 
1966 
1967  SUBROUTINE compute_col_scaling_par
1968  USE xstuff, ONLY: pcol_scale
1969  USE blocktridiagonalsolver, ONLY: getcolsum, parallelscaling
1970  USE parallel_vmec_module, ONLY: tolastntype, copylastntype
1971 C-----------------------------------------------
1972 C L o c a l V a r i a b l e s
1973 C-----------------------------------------------
1974  INTEGER :: nsmin, nsmax
1975  REAL(dp), ALLOCATABLE :: tmp(:)
1976  REAL(dp), ALLOCATABLE :: colsum(:,:)
1977  REAL(dp), PARAMETER :: levmarq_param = 1.e-6_dp
1978 C-----------------------------------------------
1979 
1980 !FOR NO COL SCALING - col-scaling not working well yet (8.1.17)
1981  pcol_scale = 1
1982  RETURN
1983 
1984 !BE SURE TO TURN OFF LEV_MARQ SCALING IN SUBROUTINE sweep3_blocks_par
1985  nsmin = tlglob; nsmax = trglob
1986 
1987  ALLOCATE (colsum(mblk_size,nsmin:nsmax))
1988  CALL getcolsum(colsum)
1989  CALL vectorcopypar (colsum, pcol_scale)
1990  CALL parallelscaling(levmarq_param,colsum)
1991 
1992  DEALLOCATE(colsum)
1993 
1994 !Convert to internal PARVMEC format
1995  ALLOCATE (tmp(ntmaxblocksize*ns))
1996  CALL tolastntype(pcol_scale,tmp)
1997  CALL copylastntype(tmp,pcol_scale)
1998  DEALLOCATE(tmp)
1999 
2000  END SUBROUTINE compute_col_scaling_par
2001 
2002  SUBROUTINE compute_col_scaling
2003  USE xstuff, ONLY: col_scale
2004 C-----------------------------------------------
2005 C L o c a l V a r i a b l e s
2006 C-----------------------------------------------
2007 
2008 !FOR NO COL SCALING
2009  col_scale = 1
2010 
2011  END SUBROUTINE compute_col_scaling
2012 
2013  SUBROUTINE vectorcopypar (colsum, colscale)
2014  USE blocktridiagonalsolver, ONLY: rank
2015 !-----------------------------------------------
2016 ! D u m m y A r g u m e n t s
2017 !-----------------------------------------------
2018  REAL(dp), INTENT(IN) :: colsum(mblk_size,tlglob:trglob)
2019  REAL(dp), INTENT(OUT) :: colscale(mblk_size,ns)
2020 
2021 !-----------------------------------------------
2022 ! L o c a l V a r i a b l e s
2023 !-----------------------------------------------
2024  INTEGER :: js, M1
2025  INTEGER :: MPI_STAT(MPI_STATUS_SIZE)
2026 !-----------------------------------------------
2027 
2028  DO js = tlglob, trglob
2029  colscale(:,js) = colsum(:,js)
2030  END DO
2031 
2032  m1 = mblk_size
2033 
2034 ! Get left boundary elements (tlglob-1)
2035  IF (rank.LT.nranks-1) THEN
2036  CALL mpi_send(colsum(:,trglob),m1,mpi_real8,
2037  1 rank+1,1,ns_comm,mpi_err)
2038  END IF
2039  IF (rank.GT.0) THEN
2040  CALL mpi_recv(colscale(:,tlglob-1),m1,
2041  1 mpi_real8,rank-1,1,ns_comm,mpi_stat,mpi_err)
2042  END IF
2043 
2044 ! Get right boundary elements (trglob+1)
2045  IF (rank.GT.0) THEN
2046  CALL mpi_send(colsum(:,tlglob),m1,mpi_real8,
2047  1 rank-1,1,ns_comm,mpi_err)
2048  END IF
2049  IF (rank.LT.nranks-1) THEN
2050  CALL mpi_recv(colscale(:,trglob+1),m1,mpi_real8,
2051  1 rank+1,1,ns_comm,mpi_stat,mpi_err)
2052  END IF
2053 
2054  END SUBROUTINE vectorcopypar
2055 
2056 
2057  SUBROUTINE free_mem_precon
2058  INTEGER :: istat
2059 
2060  istat=0
2061  IF (ALLOCATED(block_diag))
2062  1 DEALLOCATE (block_diag, block_plus, block_mins, stat=istat)
2063  IF (istat .ne. 0) stop 'Deallocation error-1 in free_mem_precon'
2064 
2065  istat=0
2066  IF (ALLOCATED(block_diag_sw))
2067  1 DEALLOCATE (block_diag_sw, block_plus_sw, block_mins_sw,
2068  2 stat=istat)
2069  IF (istat .ne. 0) stop 'Deallocation error-2 in free_mem_precon'
2070 
2071  istat=0
2072  IF (ALLOCATED(ipiv_blk)) DEALLOCATE (ipiv_blk, stat=istat)
2073  IF (istat .ne. 0) stop 'Deallocation error-3 in free_mem_precon'
2074 
2075  END SUBROUTINE free_mem_precon
2076 
2077  END MODULE precon2d
blocktridiagonalsolver
Solver for block tri-diagonal matrices. [Kalyan S. Perumalla, ORNL, 2009-2011].
Definition: blocktridiagonalsolver.f90:8
parallel_vmec_module::copylastntype
Definition: parallel_vmec_module.f90:76