!=======================================================================
!
!  2D FFT module package
!
!    Written 4/24/02 D. A. Steck
!    Based on code by S. Habib.
!
!    Subroutines provided:
!
!      1. fft2d -> perform 2D FFT, output overwrites input
!           call as: "call fft2d(integer fftsign, complex(wp) data)"
!
!      2. fft2dt -> perform 2D FFT, with option to accept output in
!           transposed form to save communication (a second inverse
!           call undoes the transpose)
!           call as: 
!           "call fft2d(integer fftsign, complex(wp) datain, 
!                                        complex(wp) dataout)"
!           where the first and second dimensions of dataout are
!           swapped with respect to datain.  Note that the
!           datain array is disturbed by the computation.
!
!    The 2D array is distributed in blocks only along the last 
!      dimension, so it is most efficient if the calling program 
!      also uses this blocking.
!
!    Uses internal 1-D fft routine.
!    Also requires the integer parameter 'wp' from the module
!      'globals' to define the KIND for the real and complex
!       variables.
!
!=======================================================================


module fft2dlib

  use globals, only : wp

  private
  public :: fft2d, fft2dt

  contains


subroutine fft2d(fftsign, x)

  implicit none

  integer, intent(in) :: fftsign
  complex(kind=wp), dimension(:,:), intent(inout) :: x

!hpf$ distribute x(*,block)

  complex(kind=wp), dimension(size(x,2),size(x,1)) :: xtemp
  integer :: i, j, n1, n2

!hpf$ align xtemp(:,:) with x(:,:)

  n1 = size(x, 1)
  n2 = size(x, 2)

! use fft2dt and then undo transpose
  call fft2dt(fftsign, x, xtemp)
  x = transpose(xtemp)

end subroutine fft2d


subroutine fft2dt(fftsign, xin, xout)

  implicit none

  integer, intent(in) :: fftsign
  complex(kind=wp), dimension(:,:), intent(inout) :: xin
  complex(kind=wp), dimension(:,:), intent(out) :: xout

!hpf$ distribute xin(*,block) 
!hpf$ distribute xout(*,block)

  ! coefficient array for calls to ffts
  complex(kind=wp), dimension(:,:), allocatable, save :: coeff

!hpf$ distribute coeff(*,*) 


  integer :: n1, n2
  integer :: n, log2n, log2n1, log2n2
  integer :: m, j, k
  real(kind=wp), parameter :: pi = 3.141592653589793238_wp
  complex(kind=wp), parameter :: i = (0.0_wp, 1.0_wp)
  integer, save :: nold = 0

  n1 = size(xin, 1)
  n2 = size(xin, 2)

! check array sizes
  if ( n2 .ne. size(xout,1) .or. n1 .ne. size(xout,2) ) then
    write(0,*) 'Error (FFT2DT): input and output array sizes do not match'
    stop
  end if

! initialize coefficient array on first call, or if fft is larger than
!   on all previous fft calls
!   This array is the U array in Bailey's paper, which stores
!   the roots of unity for stride-1 access.
! This part moved out of the usual ffts module for purity of the
!   ffts1d subroutine.
  n = max(n1, n2)
  log2n1 = int( log(real(n1,kind=wp))/log(2.0_wp) + 0.49 )
  log2n2 = int( log(real(n2,kind=wp))/log(2.0_wp) + 0.49 )
  if ( iand(n1, n1-1) .ne. 0 ) then
    write(0,*) "Error (FFT2DT): first array dimension not power of 2"
    stop
  end if
  if ( iand(n2, n2-1) .ne. 0 ) then
    write(0,*) "Error (FFT2DT): second array dimension not power of 2"
    stop
  end if
  log2n = max(log2n1, log2n2)
  if ( nold .lt. n ) then
    nold = n
    if ( allocated(coeff) ) deallocate(coeff)
    allocate(coeff(2*n, 2))
    do m = 0, log2n
      do j = 0, 2**m-1
        coeff(2**m+j,1) = exp(i*j*pi/(2**m))
      end do
    end do
    coeff(:,2) = conjg(coeff(:,1))
  end if


! transform local dimension
!hpf$ independent
  do k = 1, n2
    call ffts1d( fftsign, n1, log2n1, xin(:,k), coeff )
  end do

! copy to transposed output array so that the remaining dimension is local
  xout = transpose(xin)

! transform the local dimension
!hpf$ independent
  do k = 1, n1
    call ffts1d( fftsign, n2, log2n2, xout(:,k), coeff )
  end do


  contains


  pure subroutine ffts1d(fftsign, n, log2n, data, coeff)
  
    implicit none
  
    integer, intent(in) :: fftsign, n, log2n
    complex(kind=wp), dimension(:), intent(inout) :: data
    complex(kind=wp), dimension(:,:), intent(in) :: coeff
  
    complex(kind=wp), dimension(1:size(data)) :: work
    integer ls, ns, l, j, m
  

  ! divide cases according to whether log2n is even or odd, and "ping-pong"
  !   the computation between the data and work arrays
    if ( iand(log2n, 1) .eq. 1 ) then
      do l = 1, log2n-1, 2
        ls = 2**(l-1)
        ns = n/(ls+ls)
        call stockv1(fftsign, ls, ns, data, work, coeff)
        call stockv1(fftsign, ls*2, ns/2, work, data, coeff)
      end do
      ls = 2**(log2n-1)
      ns = n/(ls+ls)
      call stockv1(fftsign, ls, ns, data, work, coeff)
      data = work
    else
      do l = 1, log2n, 2
        ls = 2**(l-1)
        ns = n/(ls+ls)
        call stockv1(fftsign, ls, ns, data, work, coeff)
        call stockv1(fftsign, ls*2, ns/2, work, data, coeff)
      end do
    end if
  
  end subroutine ffts1d
  

  pure subroutine stockv1(fftsign, ls, ns, c, ch, coeff)
  
    implicit none
  
    integer, intent(in) :: fftsign, ls, ns
    complex(kind=wp), intent(in), dimension(ls, ns, *) :: c
    complex(kind=wp), intent(out), dimension(ls, 2, *) :: ch
    complex(kind=wp), intent(in), dimension(:,:) :: coeff
  
    complex(kind=wp) :: omegk, wyk
    integer :: j, k, spt
  
    if ( fftsign .eq. 1 ) then
      spt = 1
    else
      spt = 2
    end if
  
    do j = 1, ns
      ch(1,1,j) = c(1,j,1) + c(1,j,2)
      ch(1,2,j) = c(1,j,1) - c(1,j,2)
      do k = 2, ls
        omegk = coeff(ls+k-1,spt)
        wyk = omegk * c(k,j,2)
        ch(k,1,j) = c(k,j,1) + wyk
        ch(k,2,j) = c(k,j,1) - wyk
      end do
    end do
  
  end subroutine stockv1

  
end subroutine fft2dt
  
end module fft2dlib