!=======================================================================
!
!  3D FFT module package
!
!    Written 4/24/02 D. A. Steck
!    Updated 7/18/03
!
!    Subroutines provided:
!
!      1. fft3d -> perform 3D FFT, output overwrites input
!           call as: "call fft3d(integer fftsign, complex(wp) data)"
!
!      2. fft3dt -> perform 3D FFT, with option to accept output in
!           transposed form to save communication (a second inverse
!           call undoes the transpose)
!           call as: 
!           "call fft3d(integer fftsign, complex(wp) datain, 
!                                        complex(wp) dataout)"
!           where the first and second dimensions of dataout are
!           swapped with respect to datain.  Note that the
!           datain array is disturbed by the computation.
!
!    The 3D array is distributed in blocks only along the last 
!      dimension, so it is most efficient if the calling program 
!      also uses this blocking.
!
!    Uses internal 1-D fft routine.
!    Also requires the integer parameter 'wp' from the module
!      'globals' to define the KIND for the real and complex
!       variables.
!
!=======================================================================


module fft3dlib

  use globals, only : wp

  private
  public :: fft3d, fft3dt

  contains


subroutine fft3d(fftsign, x)

  implicit none

  integer, intent(in) :: fftsign
  complex(kind=wp), dimension(:,:,:), intent(inout) :: x

!hpf$ distribute x(*,*,block)

  complex(kind=wp), dimension(size(x,3),size(x,2),size(x,1)) :: xtemp
  integer :: i, j, k, n1, n2, n3

!hpf$ align xtemp(:,:,:) with x(:,:,:)

  n1 = size(x, 1)
  n2 = size(x, 2)
  n3 = size(x, 3)

! use fft3dt and then undo transpose
  call fft3dt(fftsign, x, xtemp)
  forall ( i = 1:n1, j = 1:n2, k = 1:n3 )  x(i,j,k) = xtemp(k,j,i)

end subroutine fft3d


subroutine fft3dt(fftsign, xin, xout)

  implicit none

  integer, intent(in) :: fftsign
  complex(kind=wp), dimension(:,:,:), intent(inout) :: xin
  complex(kind=wp), dimension(:,:,:), intent(out) :: xout

!hpf$ distribute xin(*,*,block) 
!hpf$ distribute xout(*,*,block)

  ! coefficient array for calls to ffts
  complex(kind=wp), dimension(:,:), allocatable, save :: coeff

!hpf$ distribute coeff(*,*) 


  integer :: n1, n2, n3
  integer :: n, log2n, log2n1, log2n2, log2n3
  integer :: m, j, k, l
  real(kind=wp), parameter :: pi = 3.141592653589793238_wp
  complex(kind=wp), parameter :: i = (0.0_wp, 1.0_wp)
  integer, save :: nold = 0

  n1 = size(xin, 1)
  n2 = size(xin, 2)
  n3 = size(xin, 3)

  if ( n3 .ne. size(xout,1) .or. &
       n2 .ne. size(xout,2) .or. n1 .ne. size(xout,3) ) then
    write(0,*) 'Error (FFT3DT): input and output array sizes do not match'
    stop
  end if

! initialize coefficient array on first call, or if fft is larger than
!   on all previous fft calls
!   This array is the U array in Bailey's paper, which stores
!   the roots of unity for stride-1 access.
! This part moved out of the usual ffts module for purity of the
!   ffts1d subroutine.
  n = max(n1, n2, n3)
  log2n1 = int( log(real(n1,kind=wp))/log(2.0_wp) + 0.49 )
  log2n2 = int( log(real(n2,kind=wp))/log(2.0_wp) + 0.49 )
  log2n3 = int( log(real(n3,kind=wp))/log(2.0_wp) + 0.49 )
  if ( iand(n1, n1-1) .ne. 0 ) then
    write(0,*) "Error (FFT3DT): first array dimension not power of 2"
    stop
  end if
  if ( iand(n2, n2-1) .ne. 0 ) then
    write(0,*) "Error (FFT3DT): second array dimension not power of 2"
    stop
  end if
  if ( iand(n3, n3-1) .ne. 0 ) then
    write(0,*) "Error (FFT3DT): third array dimension not power of 2"
    stop
  end if
  log2n = max(log2n1, log2n2, log2n3)
  if ( nold .lt. n ) then
    nold = n
    if ( allocated(coeff) ) deallocate(coeff)
    allocate(coeff(2*n, 2))
    do m = 0, log2n
      do j = 0, 2**m-1
        coeff(2**m+j,1) = exp(i*j*pi/(2**m))
      end do
    end do
    coeff(:,2) = conjg(coeff(:,1))
  end if


! transform one local dimension
!hpf$ independent
  do k = 1, n3
!hpf$ independent
    do j = 1, n2
      call ffts1d( fftsign, n1, log2n1, xin(:,j,k), coeff )
    end do
  end do

! copy to transposed output array so that the remaining 2 dimensions are local
  forall ( j = 1:n1, k = 1:n2, l = 1:n3 )  xout(l,k,j) = xin(j,k,l)

! transform the unit-stride local dimension
!hpf$ independent
  do k = 1, n1
!hpf$ independent
    do j = 1, n2
      call ffts1d( fftsign, n3, log2n3, xout(:,j,k), coeff )
    end do
  end do

! transform the remaining local dimension
!hpf$ independent
  do k = 1, n1
!hpf$ independent
    do j = 1, n3
      call ffts1d( fftsign, n2, log2n2, xout(j,:,k), coeff )
    end do
  end do

  contains


  pure subroutine ffts1d(fftsign, n, log2n, data, coeff)
  
    implicit none
  
    integer, intent(in) :: fftsign, n, log2n
    complex(kind=wp), dimension(:), intent(inout) :: data
    complex(kind=wp), dimension(:,:), intent(in) :: coeff
  
    complex(kind=wp), dimension(1:size(data)) :: work
    integer ls, ns, l, j, m
  

  ! divide cases according to whether log2n is even or odd, and "ping-pong"
  !   the computation between the data and work arrays
    if ( iand(log2n, 1) .eq. 1 ) then
      do l = 1, log2n-1, 2
        ls = 2**(l-1)
        ns = n/(ls+ls)
        call stockv1(fftsign, ls, ns, data, work, coeff)
        call stockv1(fftsign, ls*2, ns/2, work, data, coeff)
      end do
      ls = 2**(log2n-1)
      ns = n/(ls+ls)
      call stockv1(fftsign, ls, ns, data, work, coeff)
      data = work
    else
      do l = 1, log2n, 2
        ls = 2**(l-1)
        ns = n/(ls+ls)
        call stockv1(fftsign, ls, ns, data, work, coeff)
        call stockv1(fftsign, ls*2, ns/2, work, data, coeff)
      end do
    end if
  
  end subroutine ffts1d
  

  pure subroutine stockv1(fftsign, ls, ns, c, ch, coeff)
  
    implicit none
  
    integer, intent(in) :: fftsign, ls, ns
    complex(kind=wp), intent(in), dimension(ls, ns, *) :: c
    complex(kind=wp), intent(out), dimension(ls, 2, *) :: ch
    complex(kind=wp), intent(in), dimension(:,:) :: coeff
  
    complex(kind=wp) :: omegk, wyk
    integer :: j, k, spt
  
    if ( fftsign .eq. 1 ) then
      spt = 1
    else
      spt = 2
    end if
  
    do j = 1, ns
      ch(1,1,j) = c(1,j,1) + c(1,j,2)
      ch(1,2,j) = c(1,j,1) - c(1,j,2)
      do k = 2, ls
        omegk = coeff(ls+k-1,spt)
        wyk = omegk * c(k,j,2)
        ch(k,1,j) = c(k,j,1) + wyk
        ch(k,2,j) = c(k,j,1) - wyk
      end do
    end do
  
  end subroutine stockv1

  
end subroutine fft3dt
  
end module fft3dlib