!#########################################################################
!		
!    Copyright (C) 2003-2012 Department of Physics and Astronomy,
!                            University of Rochester,
!                            Rochester, NY
!
!    timing.f90 is part of AstroBEAR.
!
!    AstroBEAR is free software: you can redistribute it and/or modify	  
!    it under the terms of the GNU General Public License as published by 
!    the Free Software Foundation, either version 3 of the License, or    
!    (at your option) any later version.
!
!    AstroBEAR is distributed in the hope that it will be useful, 
!    but WITHOUT ANY WARRANTY; without even the implied warranty of
!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
!    GNU General Public License for more details.
!
!    You should have received a copy of the GNU General Public License
!    along with AstroBEAR.  If not, see <http://www.gnu.org/licenses/>.
!
!#########################################################################
MODULE Timing
   USE GlobalDeclarations
   USE HyperbolicDeclarations
   IMPLICIT NONE
   SAVE
   ! Integer handles used to index the module-level Timers array
   ! (Timers(index) in StartTimer/StopTimer/TimerInit/WriteStats).
   ! The table is split into three PARAMETER statements so that no single
   ! statement needs an unusually large number of continuation lines.

   ! --- Indices 1..32: the span covered by StageLow..StageHigh below ---
   INTEGER, PARAMETER ::                          &
        iInitInfos                      =  1,     &
        iProlongateParentsData          =  2,     &
        iChildMaskOverlaps              =  3,     &
        iUpdateOverlaps                 =  4,     &
        iApplyOverlaps                  =  5,     &
        iAfterOverlaps                  =  6,     &
        iParticleUpdate                 =  7,     &
        iApplyPhysicalBCs               =  8,     &
        iSetErrFlags                    =  9,     &
        iAgeNodesChildren               = 10,     &
        iBackUpNodes                    = 11,     &
        iCreateChildrens                = 12,     &
        iInheritOverlapsOldChildren     = 13,     &
        iInheritNeighborsChildren       = 14,     &
        iInheritOverlapsNewChildren     = 15,     &
        iInheritOldNodeOverlapsChildren = 16,     &
        iInheritNewNodeOverlapsChildren = 17,     &
        iScheduledAdvanceGrids          = 18,     &
        iAdvanceGrids                   = 19,     &
        iRestrictionFixups              = 20,     &
        iAMR                            = 21,     &
        iElliptic                       = 22,     &
        iPrintAdvance                   = 23,     &
        iApplyChildrenData              = 24,     &
        iCompleteAdvancedGrids          = 25,     &
        iUpdateChildMasks               = 26,     &
        iSyncFluxes                     = 27,     &
        iAccumulateFluxes               = 28,     &
        iNullifyNeighbors               = 29,     &
        iCoarsenDataForParents          = 30,     &
        iClearParentProcs               = 31,     &
        iAfterFixups                    = 32

   ! --- Indices 33..56: the "ii" send/receive timers ---
   ! Index 41 is not assigned to any name; Timers(41) therefore keeps its
   ! default blank description.
   INTEGER, PARAMETER ::                                &
        iiRecvGridsFromParents                = 33,     &
        iiSendGridsToChildren                 = 34,     &
        iiRecvParentsData                     = 35,     &
        iiSendChildrenData                    = 36,     &
        iiRecvOverlapsNeighbors               = 37,     &
        iiSendOverlapsNeighbors               = 38,     &
        iiRecvOldNodeOverlaps                 = 39,     &
        iiSendOverlapsToOldNodesChildren      = 40,     &
        iiSendOverlapsToNodesOldChildren      = 42,     &
        iiRecvOverlaps                        = 43,     &
        iiSendOverlaps                        = 44,     &
        iiRecvNeighboringChildren             = 45,     &
        iiSendNeighboringChildren             = 46,     &
        iiRecvOverlappingChildrenFromNewNodes = 47,     &
        iiSendOverlappingChildrenToOldNodes   = 48,     &
        iiRecvOverlappingChildrenFromOldNodes = 49,     &
        iiSendOverlappingChildrenToNewNodes   = 50,     &
        iiRecvChildrenData                    = 51,     &
        iiSendParentsData                     = 52,     &
        iiRecvFluxes                          = 53,     &
        iiSendFluxes                          = 54,     &
        iiRecvEllipticData                    = 55,     &
        iiSendEllipticData                    = 56

   ! --- Indices 57..64: remaining timers ---
   INTEGER, PARAMETER ::             &
        iWaitingAdvances     = 57,   &
        iBackUpData          = 58,   &
        iApplyEllipticBCs    = 59,   &
        iDistributeChildrens = 60,   &
        iProcessData         = 61,   &
        iWriteData           = 62,   &
        iBarrier             = 63,   &
        iTestBadCFL          = 64

   ! Size of the Timers array; equals the largest index assigned above.
   INTEGER, PARAMETER :: MaxTimers = 64

   ! Index ranges; none of these four constants is referenced inside this module.
   ! NOTE(review): CommHigh = 55 stops at iiRecvEllipticData and leaves
   ! iiSendEllipticData (56) outside CommLow..CommHigh - confirm against the
   ! code that consumes this range before changing it.
   INTEGER, PARAMETER :: CommLow   = 33
   INTEGER, PARAMETER :: CommHigh  = 55
   INTEGER, PARAMETER :: StageLow  = 1
   INTEGER, PARAMETER :: StageHigh = 32
   ! Bookkeeping for one named timer. Both arrays are indexed by the level
   ! argument n passed to StartTimer/StopTimer, over the range -2:MaxDepth.
   TYPE TimerDef
      ! Value of MPI_Wtime() recorded by the most recent StartTimer call
      ! for each level (no default initialisation).
      REAL(8), DIMENSION(-2:MaxDepth) :: LastStarted
      ! Sum of the elapsed intervals added by StopTimer for each level.
      REAL(8), DIMENSION(-2:MaxDepth) :: Accumulator = 0
      ! Label printed next to the timer in WriteStats.
      CHARACTER(LEN=40) :: description = ''
   END TYPE TimerDef

   ! One entry per timer index (1..MaxTimers).
   TYPE(TimerDef), DIMENSION(MaxTimers) :: Timers
   ! Stand-alone timers; within this module only their Accumulator arrays
   ! are reduced and printed by WriteStats (they are filled elsewhere).
   TYPE(TimerDef), PUBLIC :: AdvanceTimer, AdvancePredictor

CONTAINS

   !> Record the current wall-clock time as the start of timer `index`
   !! on level `n`.
   !!
   !! When lTimingLog is set (and the timer is not iAMR) a line holding the
   !! time elapsed since InitTime and the code 100*(n+2)+index+MPI_ID is
   !! appended to the timer log unit first, so the log write itself is not
   !! part of the measured interval.
   !!
   !! @param index  position in the Timers array (one of the i*/ii* constants)
   !! @param n      level slot, must lie within -2:MaxDepth
   SUBROUTINE StartTimer(index,n)
      INTEGER, INTENT(IN) :: index, n
      IF (lTimingLog .AND. index /= iAMR) write(TIMER_LOG_HANDLE,*) MPI_Wtime()-InitTime, 100*(n+2)+index+MPI_ID
!      write(*,'(A,I6,A,A,A,I3,A,E15.5)') 'Processor ', MPI_ID, ' Starting ', Timers(index)%description, ' on level ', n, ' at ', MPI_Wtime()-InitTime
      Timers(index)%LastStarted(n)=MPI_Wtime()
   END SUBROUTINE StartTimer

   !> Stop timer `index` on level `n` and add the time elapsed since the
   !! matching StartTimer call to its accumulator.
   !!
   !! When lTimingLog is set (and the timer is not iAMR) the same log line
   !! format as in StartTimer is written before the elapsed time is taken.
   !! The caller is responsible for having called StartTimer(index,n) first;
   !! LastStarted has no default value.
   !!
   !! @param index  position in the Timers array (one of the i*/ii* constants)
   !! @param n      level slot, must lie within -2:MaxDepth
   SUBROUTINE StopTimer(index,n)
      INTEGER, INTENT(IN) :: index, n
      IF (lTimingLog .AND. index /= iAMR) write(TIMER_LOG_HANDLE,*) MPI_Wtime()-InitTime, 100*(n+2)+index+MPI_ID
!      write(*,'(A,I6,A,A,A,I3,A,E15.5)') 'Processor ', MPI_ID, ' Stopping ', Timers(index)%description, ' on level ', n, ' at ', MPI_Wtime()-InitTime
      Timers(index)%Accumulator(n) = Timers(index)%Accumulator(n) &
           + MPI_Wtime() - Timers(index)%LastStarted(n)
   END SUBROUTINE StopTimer

   !> Prepare the timing module for use.
   !!
   !! Clears every accumulator in Timers, assigns the printable label of each
   !! timer, opens the per-process timing log when lTimingLog is set, and
   !! sets InitTime to the average of MPI_Wtime() over all processes
   !! (sum over MPI_COMM_WORLD divided by MPI_NP).
   SUBROUTINE TimerInit()
      INTEGER :: k, iErr
      ! "timer_" (6) + zero-padded 4-digit rank + ".log" (4) = 14 characters
      CHARACTER(LEN=14) :: LogName

      ! Reset all accumulated times.
      DO k = 1, MaxTimers
         Timers(k)%Accumulator = 0
      END DO

      ! Labels, listed in index order. Index 41 has no name and keeps the
      ! default blank description.
      Timers(iInitInfos)%description                      = 'InitInfos'
      Timers(iProlongateParentsData)%description          = 'ProlongateParentsData'
      Timers(iChildMaskOverlaps)%description              = 'ChildMaskOverlaps'
      Timers(iUpdateOverlaps)%description                 = 'UpdateOverlaps'
      Timers(iApplyOverlaps)%description                  = 'ApplyOverlaps'
      Timers(iAfterOverlaps)%description                  = 'AfterOverlaps'
      Timers(iParticleUpdate)%description                 = 'ParticleUpdate'
      Timers(iApplyPhysicalBCs)%description               = 'ApplyPhysicalBCs'
      Timers(iSetErrFlags)%description                    = 'SetErrFlags'
      Timers(iAgeNodesChildren)%description               = 'AgeNodesChildren'
      Timers(iBackUpNodes)%description                    = 'BackUpNodes'
      Timers(iCreateChildrens)%description                = 'CreateChildrens'
      Timers(iInheritOverlapsOldChildren)%description     = 'InheritOverlapsOldChildren'
      Timers(iInheritNeighborsChildren)%description       = 'InheritNeighborsChildren'
      Timers(iInheritOverlapsNewChildren)%description     = 'InheritOverlapsNewChildren'
      Timers(iInheritOldNodeOverlapsChildren)%description = 'InheritOldNodeOverlapsChildren'
      Timers(iInheritNewNodeOverlapsChildren)%description = 'InheritNewNodeOverlapsChildren'
      Timers(iScheduledAdvanceGrids)%description          = 'ScheduleAdvanceGrids'
      Timers(iAdvanceGrids)%description                   = 'AdvanceGrids'
      Timers(iRestrictionFixups)%description              = 'RestrictionFixups'
      Timers(iAMR)%description                            = 'AMR'
      Timers(iElliptic)%description                       = 'Elliptic'
      Timers(iPrintAdvance)%description                   = 'PrintAdvance'
      Timers(iApplyChildrenData)%description              = 'ApplyChildrenData'
      Timers(iCompleteAdvancedGrids)%description          = 'CompleteAdvancedGrids'
      Timers(iUpdateChildMasks)%description               = 'UpdateChildMasks'
      Timers(iSyncFluxes)%description                     = 'SyncFluxes'
      Timers(iAccumulateFluxes)%description               = 'AccumulateFluxes'
      Timers(iNullifyNeighbors)%description               = 'NullifyNeighbors'
      Timers(iCoarsenDataForParents)%description          = 'CoarsenDataForParents'
      Timers(iClearParentProcs)%description               = 'ClearParentProcs'
      Timers(iAfterFixups)%description                    = 'AfterFixups'

      Timers(iiRecvGridsFromParents)%description                = 'RecvGridsFromParents'
      Timers(iiSendGridsToChildren)%description                 = 'SendGridsToChildren'
      Timers(iiRecvParentsData)%description                     = 'RecvParentsData'
      Timers(iiSendChildrenData)%description                    = 'SendChildrenData'
      Timers(iiRecvOverlapsNeighbors)%description               = 'RecvOverlapsNeighbors'
      Timers(iiSendOverlapsNeighbors)%description               = 'SendOverlapsNeighbors'
      Timers(iiRecvOldNodeOverlaps)%description                 = 'RecvOldNodeOverlaps'
      Timers(iiSendOverlapsToOldNodesChildren)%description      = 'SendOverlapsToOldNodesChildren'
      Timers(iiSendOverlapsToNodesOldChildren)%description      = 'SendOverlapsToNodesOldChildren'
      Timers(iiRecvOverlaps)%description                        = 'RecvOverlaps'
      Timers(iiSendOverlaps)%description                        = 'SendOverlaps'
      Timers(iiRecvNeighboringChildren)%description             = 'RecvNeighboringChildren'
      Timers(iiSendNeighboringChildren)%description             = 'SendNeighboringChildren'
      Timers(iiRecvOverlappingChildrenFromNewNodes)%description = 'RecvOverlappingChildrenFromNewNodes'
      Timers(iiSendOverlappingChildrenToOldNodes)%description   = 'SendOverlappingChildrenToOldNodes'
      Timers(iiRecvOverlappingChildrenFromOldNodes)%description = 'RecvOverlappingChildrenFromOldNodes'
      Timers(iiSendOverlappingChildrenToNewNodes)%description   = 'SendOverlappingChildrenToNewNodes'
      Timers(iiRecvChildrenData)%description                    = 'RecvChildrenData'
      Timers(iiSendParentsData)%description                     = 'SendParentsData'
      Timers(iiRecvFluxes)%description                          = 'RecvFluxes'
      Timers(iiSendFluxes)%description                          = 'SendFluxes'
      Timers(iiRecvEllipticData)%description                    = 'RecvEllipticData'
      Timers(iiSendEllipticData)%description                    = 'SendEllipticData'

      Timers(iWaitingAdvances)%description     = 'WaitingAdvances'
      Timers(iApplyEllipticBCs)%description    = 'ApplyEllipticBCs'
      Timers(iDistributeChildrens)%description = 'DistributeChildrens'
      Timers(iProcessData)%description         = 'ProcessData'
      Timers(iWriteData)%description           = 'WriteData'
      Timers(iBarrier)%description             = 'Barrier'
      Timers(iTestBadCFL)%description          = 'TestBadCFL'

      ! Optional per-process event log, closed again in WriteStats.
      IF (lTimingLog) THEN
         write(LogName,'(A6,I4.4,A4)') "timer_",mpi_id,".log"
         OPEN (UNIT=TIMER_LOG_HANDLE, file=LogName, status="unknown")
         write(TIMER_LOG_HANDLE, '(A,I4.4)') '# Timer_', MPI_ID
      END IF

      ! Common time origin: mean of the local clocks over all processes.
      InitTime = MPI_WTime()
      CALL MPI_ALLREDUCE(MPI_IN_PLACE, InitTime, 1, MPI_DOUBLE_PRECISION, &
           MPI_SUM, MPI_COMM_WORLD, iErr)
      InitTime = InitTime/MPI_NP
   END SUBROUTINE TimerInit


   !> Combine the timing data of all processes and print the report.
   !!
   !! All processes must call this routine: it performs several
   !! MPI_ALLREDUCE operations on MPI_COMM_WORLD. Only the process with
   !! MPI_ID == 0 writes to standard output. The timing log unit is closed
   !! at the end when lTimingLog is set.
   !!
   !! Side effect: AdvancePredictor%Accumulator and AdvanceTimer%Accumulator
   !! are replaced in place by their sums over all processes.
   SUBROUTINE WriteStats()
      ! Integer kind wide enough for the global cell-update count; a default
      ! integer overflows once the count exceeds HUGE(0).
      INTEGER, PARAMETER :: CountKind = SELECTED_INT_KIND(18)
      INTEGER :: iErr, i
      REAL(8) :: TotalAccumulators(1:MaxTimers,-2:MaxDepth)
      REAL(8) :: TotalAmrTime
      REAL(8) :: TotalCellUpdates
      REAL(8) :: LocalCellUpdates

      ! Gather the local accumulators into one contiguous array and average
      ! them over the processes.
      DO i=1, MaxTimers
         TotalAccumulators(i,:)=Timers(i)%Accumulator
      END DO

      CALL MPI_ALLREDUCE(MPI_IN_PLACE, TotalAccumulators, (MaxDepth+3)*MaxTimers, &
           MPI_DOUBLE_PRECISION, MPI_SUM, MPI_COMM_WORLD, iErr)
      TotalAccumulators=TotalAccumulators/REAL(MPI_NP, 8)


      !   CALL MPI_ALLREDUCE(MPI_IN_PLACE, NumCellUpdatesByLevel, MaxLevel+1, MPI_INTEGER, MPI_SUM, MPI_COMM_WORLD, iErr)
      !   CALL MPI_ALLREDUCE(MPI_IN_PLACE, EffectiveCellUpdatesByLevel, MaxLevel+1, MPI_INTEGER, MPI_SUM, MPI_COMM_WORLD, iErr)

      IF (MPI_ID == 0) THEN
         ! Keep the base-level iAMR total before it is modified below.
         TotalAmrTime=TotalAccumulators(iAMR,BaseLevel)
         ! Subtract the next finer level's iAMR time from each level. The
         ! loop runs upward, so entry i+1 is still unmodified when it is used.
         DO i=BaseLevel,MaxLevel-1
            TotalAccumulators(iAMR,i) = TotalAccumulators(iAMR,i) - TotalAccumulators(iAMR,i+1)
         END DO

         write(*,*) ' ========== Relative Times overall =========== '
         DO i=1,MaxTimers
            write(*,'(A42, 20F12.2)') Timers(i)%description, &
                 TotalAccumulators(i,BaseLevel:MaxLevel)/TotalAmrTime*100d0
         END DO


         write(*,*) ' ========== Relative Times Across all levels =========== '
         DO i=1, MaxTimers
            write(*,'(A42, 20F12.2)') Timers(i)%description, &
                 SUM(TotalAccumulators(i,BaseLevel:MaxLevel))/TotalAmrTime*100d0
         END DO


         write(*,*) ' ========== Relative Times within Level =========== '
         DO i=1,MaxTimers
            write(*,'(A42, 20F12.2)') Timers(i)%description, &
                 TotalAccumulators(i,BaseLevel:MaxLevel)/TotalAccumulators(iAMR,BaseLevel:MaxLevel)*100d0
         END DO

         write(*,*) ' ========== Relative Times of each Level =========== '
         ! Start at BaseLevel so the rows cover the same levels as the
         ! denominator and the percentages add up to 100.
         DO i=BaseLevel,MaxLevel
            write(*,'(A38, I4, 20F12.2)') "Level ", i, &
                 TotalAccumulators(iAMR,i)/SUM(TotalAccumulators(iAMR,BaseLevel:MaxLevel))*100d0
         END DO


         !      write(*,*) ' ========== Cells by level ==============='
         !      DO i=0,MaxLevel
         !         write(*,'(I4,2I13)') i, NumCellUpdatesByLevel(i), EffectiveCellUpdatesByLevel(i)
         !      END DO

         !      write(*,*) ' ========== Filling Fraction By Level =========== '
         !      DO i=1,MaxLevel
         !         write(*,'(I4,2F13.3)') i, REAL(NumCellUpdatesByLevel(i))/REAL(NumCellUpdatesByLevel(i-1)*2**(nDim+1)), REAL(EffectiveCellUpdatesByLevel(i))/REAL(NumCellUpdatesByLevel(i-1)*2**(nDim+1))
         !      END DO

         write(*,*) "Total AMR Time=", SUM(TotalAccumulators(iAMR,BaseLevel:MaxLevel))
         !      write(*,*) "Total WorkLoad=", SUM(NumCellUpdatesByLevel(:)), SUM(EffectiveCellUpdatesByLevel(:))

      END IF

      ! Accumulator has lower bound -2, so a count of MaxLevel+3 reduces the
      ! elements for levels -2..MaxLevel.
      CALL MPI_ALLREDUCE(MPI_IN_PLACE, AdvancePredictor%Accumulator, MaxLevel+3, &
           MPI_DOUBLE_PRECISION, MPI_SUM, MPI_COMM_WORLD, iErr)

      CALL MPI_ALLREDUCE(MPI_IN_PLACE, AdvanceTimer%Accumulator, MaxLevel+3, &
           MPI_DOUBLE_PRECISION, MPI_SUM, MPI_COMM_WORLD, iErr)

      ! Use a named variable as the send buffer instead of an expression.
      LocalCellUpdates = REAL(SUM(InternalCellUpdates), 8)
      CALL MPI_ALLREDUCE(LocalCellUpdates, TotalCellUpdates, 1, &
           MPI_DOUBLE_PRECISION, MPI_SUM, MPI_COMM_WORLD, iErr)

      IF (MPI_ID == 0) THEN
         write(*,'(A,16E25.16)') "Predicted work load", AdvancePredictor%Accumulator(0:MaxLevel)
         write(*,'(A,16E25.16)') "Actual advance work", AdvanceTimer%Accumulator(0:MaxLevel)

         ! NOTE(review): the header lists 'LevelBalance' twice and the data
         ! format uses 2E13.4, so LevelBalance is presumably a 2-element
         ! array - confirm against its declaration.
         write(*,'(A6,9A13,2A8)') 'stats', 'AdvanceTimer', 'BarrierTimer', 'CPUTime', &
              'OtherTime', 'CellUpdates', 'iThreaded', 'LevelBalance', 'LevelBalance', &
              'MaxLevel', 'mx', 'MPI_NP'
         write(*,'(A6,4E13.4,I13,I13,2E13.4,I13,2I8)') 'stats', &
              SUM(TotalAccumulators(iAdvanceGrids,:)), &
              SUM(TotalAccumulators(iBarrier,:)), &
              TotalAmrTime, &
              TotalAmrTime-SUM(TotalAccumulators(iBarrier,:))-SUM(AdvanceTimer%Accumulator(0:MaxLevel))/MPI_NP, &
              NINT(TotalCellUpdates, KIND=CountKind), iThreaded, LevelBalance, MaxLevel, Gmx(1), MPI_NP

      END IF
      IF (lTimingLog) THEN
         CLOSE(TIMER_LOG_HANDLE)
      END IF

   END SUBROUTINE WriteStats
END MODULE Timing
!   REAL(8), DIMENSION(4) :: EfficiencyStats
!   DO i=1,MaxTimers
!      write(*,'(
!   PRINT *
!   PRINT *, "PROC ", MPI_id, " COMMUNICATION STAGES"
!!   DO n = 1, nStages
!      PRINT "('Proc ', i2, ' stage ', i2, ' times = ', 9f9.3, ' seconds.')", MPI_id, n, stage_times(:,n)
!   END DO

!   PRINT *
!   PRINT *, "PROC ", MPI_ID, " SERIAL STAGES"

!   PRINT "('Proc ', i2, ' ClearParentProcs times               = ', 9f9.3, ' seconds.')", MPI_id, tClearParentProcs
!   PRINT "('Proc ', i2, ' AfterFixups times                    = ', 9f9.3, ' seconds.')", MPI_id, tAfterFixups
!   PRINT "('Proc ', i2, ' AdvanceEfficiency                    = ', 9f9.3, ' %.')", MPI_id, 100d0*(1d0-sum(tPrintAdvance)/sum(tCompleteAdvancedGrids+tWaitingAdvances))
!   PRINT "('Proc ', i2, ' ExpectedEfficiency                   = ', 9f9.3, ' %.')", MPI_id, 100d0*(1d0-sum(TimeWastedByLevel)/sum(AccumulatedWorkDoneByLevel))
!   EfficiencyStats=(/sum(TimeWastedByLevel),sum(AccumulatedWorkDoneByLevel),sum(tPrintAdvance),sum(tCompleteAdvancedGrids+tWaitingAdvances)/)
!   CALL MPI_ALLREDUCE(MPI_IN_PLACE, EfficiencyStats, 4, MPI_DOUBLE_PRECISION, MPI_SUM, MPI_COMM_WORLD, iErr)
!   PRINT "('Proc ', i2, ' ExpectedOverallEfficiency            = ', 9f9.3, ' %.')", MPI_id, 100d0*(1d0-EfficiencyStats(1)/EfficiencyStats(2))
!   PRINT "('Proc ', i2, ' ActualOverallEfficiency              = ', 9f9.3, ' %.')", MPI_id, 100d0*(1d0-EfficiencyStats(3)/EfficiencyStats(4))
! Shut down the MPI job.

