!#########################################################################
!		
!    Copyright (C) 2003-2012 Department of Physics and Astronomy,
!                            University of Rochester,
!                            Rochester, NY
!
!    distribution_control.f90 is part of AstroBEAR.
!
!    AstroBEAR is free software: you can redistribute it and/or modify	  
!    it under the terms of the GNU General Public License as published by 
!    the Free Software Foundation, either version 3 of the License, or    
!    (at your option) any later version.
!
!    AstroBEAR is distributed in the hope that it will be useful, 
!    but WITHOUT ANY WARRANTY; without even the implied warranty of
!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
!    GNU General Public License for more details.
!
!    You should have received a copy of the GNU General Public License
!    along with AstroBEAR.  If not, see <http://www.gnu.org/licenses/>.
!
!#########################################################################
MODULE DistributionControl
   USE GlobalDeclarations
   USE DataDeclarations
   USE TreeDeclarations
   USE DistributionDeclarations
   USE Scheduling
   USE DataInfoOps
   USE Timing
   IMPLICIT NONE


   REAL(KIND=qPREC), PARAMETER :: Safety_Fact=.9d0
   REAL(KIND=qPREC) :: TotalNewWorkLoad, CoarserWorkLoad
   REAL(KIND=qPREC) :: MeanWorkLoadByLevel(-1:MaxDepth)=0
   REAL(KIND=qPREC) :: SplitFactor(-2:MaxDepth)=1d0
   LOGICAL :: lDistVerbose=.false.
   
CONTAINS


   SUBROUTINE DistributionInit
      INTEGER :: temp, n
!      ALLOCATE(MyRemainingWorkLoad(-2:MaxLevel))
      ALLOCATE(ParentProcs(-1:MaxLevel))
      ALLOCATE(ChildProcs(-2:MaxLevel-1))      
!      MyRemainingWorkLoad=0
      levels(-2)%MPI_COMM=MPI_COMM_WORLD
      DO n = -1, MaxLevel
         NULLIFY(ParentProcs(n)%p)
         NULLIFY(ChildProcs(n-1)%p)
      END DO
      IF (MPI_ID /= 0) THEN
         ALLOCATE(ParentProcs(-1)%p(1))
         ParentProcs(-1)%p=0
      END IF
      temp=1      
      DO Rootn=1,1000
         temp=temp*2
         if (temp >= maxval(levels(0)%mX)) exit
      END DO
      di=REAL(2**Rootn)/REAL(levels(0)%mX(1))
      dj=REAL(2**Rootn)/REAL(levels(0)%mX(2))
      dk=REAL(2**Rootn)/REAL(levels(0)%mX(3))

   END SUBROUTINE DistributionInit

   !> Updates workload per level per step g_l^p
   SUBROUTINE GetLevelLoad(n)
      INTEGER :: n,i
      TYPE(NodeDefList), POINTER :: nodelist
      TYPE(NodeDef), POINTER :: node
      WorkLoadByLevelPerStep(n,:)=0
      nodelist=>Nodes(n)%p      
      IF (n < 0) RETURN
      
      DO WHILE (associated(nodelist))
         node=>nodelist%self        
         DO i=1, levels(n)%steps
            node%info%CostPerGrid(i) = GetMyCosts(node%info,i)
         END DO
         WorkLoadByLevelPerStep(n,1:levels(n)%steps)=WorkLoadByLevelPerStep(n,1:levels(n)%steps)+node%info%costperGrid(1:levels(n)%steps)
         nodelist=>nodelist%next
      END DO
      MeanWorkLoadByLevelPerStep(n)=sum(WorkLoadByLevelPerStep(n,1:levels(n)%steps))/real(levels(n)%steps)
      
   END SUBROUTINE GetLevelLoad

   SUBROUTINE ClearCompletedWorkLoads(n)
      INTEGER :: n
      WorkDoneByLevel(n)=0d0
   END SUBROUTINE ClearCompletedWorkLoads


   SUBROUTINE SortNodes(n)
      INTEGER :: n, nNodes, i
      INTEGER, DIMENSION(:,:,:), POINTER :: nodegrids
      TYPE(NodeDefList), POINTER :: SortedNodes, LastSortedNode, temp, nodelist
      nNodes=NodeCount(Nodes(n)%p)
      IF (nNodes <= 1) RETURN
      NULLIFY(SortedNodes, LastSortedNode)
      ALLOCATE(nodegrids(3,2,nNodes))
      nodelist=>Nodes(n)%p      
      i=0
      DO WHILE (associated(nodelist))
         i=i+1
         nodegrids(:,:,i)=Nodelist%self%box%mGlobal
         nodelist=>nodelist%next
      END DO
      CALL HilbertSort(nodegrids, n)
      nodelist=>Nodes(n)%p
      i=1

      DO WHILE (i <= nNodes)
         IF (ALL(nodegrids(:,:,i) == nodelist%self%box%mGlobal)) THEN
            CALL AddNodeToList(nodelist%self, LastSortedNode, SortedNodes)
            i=i+1
         END IF
         nodelist=>nodelist%next
         IF (.NOT. (ASSOCIATED(nodelist))) nodelist=>Nodes(n)%p
      END DO
      temp=>Nodes(n)%p
      Nodes(n)%p=>SortedNodes
      LastLocalNode(n)%p=>LastSortedNode
      CALL ClearNodeList(temp)      

      DEALLOCATE(nodegrids)
   END SUBROUTINE SortNodes


   !> @brief Creates and distributes new children of node
   !! @param  n level
   SUBROUTINE DistributeChildrens(n, lsplitOpt)
      INTEGER :: n
      TYPE(NodeDefList), POINTER :: nodelist, children, childlist
      TYPE(NodeDef), POINTER :: node, child, newchild
      INTEGER :: cp, pp, i, iErr, j, l, nChildren, m, test,k, childsteps_remaining, nextproc
      INTEGER, DIMENSION(:), ALLOCATABLE :: parentproc, childproc
      REAL(KIND=qPREC), DIMENSION(:), ALLOCATABLE :: childtimes, ChildLoad, newload, maxchildtimes
      REAL(KIND=qPREC), DIMENSION(:,:), ALLOCATABLE :: workloadsbyprocessor, parentoverlaps, childoverlaps
      REAL(KIND=qPREC) :: MyWorkLoads(3), TotalWOrkLoad, MeanWorkLoad, LocalCost, childoverlap, parentoverlap, ChildCostRemaining, temp, TotalLocalCost
      LOGICAL, OPTIONAL :: lsplitOpt
      LOGICAL :: lsplit
      REAL(KIND=qPREC), DIMENSION(:), POINTER :: splitweight, maxsplitweight
      INTEGER, DIMENSION(:), POINTER :: splitproc
      INTEGER, DIMENSION(:,:,:), POINTER :: newgrids, childgrids, oldgrids
      TYPE(NodeBox), POINTER :: child_box
      INTEGER, DIMENSION(:), ALLOCATABLE :: req
      INTEGER, DIMENSION(:), ALLOCATABLE :: steps_remaining
      INTEGER, DIMENSION(MPI_STATUS_SIZE) :: status
      REAL(KIND=qPREC) :: TotalExcessPerProc, DesiredExcessPerProc, SplitExcessPerProc, alpha
      CALL StartTimer(iDistributeChildrens, n)


      !First get local cost of new children

      nodelist=>Nodes(n)%p     
      localcost=0

      DO WHILE (associated(nodelist))
         node=>nodelist%self
         children=>node%children
         DO WHILE (ASSOCIATED(children))
            LocalCost=LocalCost+max(GetChildCosts(children%self%box%mGlobal,n+1), 1e-20) !product(children%self%box%mGlobal(:,2)-children%self%box%mGlobal(:,1)+1)*levels(n+1)%steps
            children=>children%next
         END DO
         nodelist=>nodelist%next
      END DO


      !Take care of single processor stuff before returning
      IF (MPI_NP == 1 .AND. n <= FinestLevel) THEN
         IF (LocalCost > 0) THEN
            FinestLevel=n+1
         ELSE
            FinestLevel=min(n,FinestLevel)
         END IF
         RestartStep=lRequestRestart
         CALL StopTimer(iDistributeChildrens, n)
         RETURN
      END IF


      !Determine whether or not we are splitting
      IF (Present(lsplitopt)) THEN !On restarts we don't split grids to better distribute until the data has been read in
         lsplit=lsplitOpt
      ELSE IF (n == -2) THEN !Never split level -2
         lsplit=.false.
      ELSE
         lsplit=.true. !Default behavior is to split - based on min and max values
      END IF


      !Allocate space for 3 quantities for each processor.  (Coarser Level Work Remaining, New Child Loads, RestartFlag)
      ALLOCATE(WorkLoadsByProcessor(3,MPI_NP))

      !Calculate Coarser level work remaining per level n step
      IF (iThreaded == NON_THREADED .OR. n < -1) THEN ! .OR. n < 0) THEN
         MyWorkLoads(1) = 0d0 !Shouldn't have any coarser work left
      ELSE         
         ! Calculate steps remaining on each level
         ALLOCATE(steps_remaining(-1:MaxLevel))
         steps_remaining=0
         DO i=-1, MaxLevel
            steps_remaining(i) = sum((levels(-1:i)%steps-levels(-1:i)%step) * 2**(/((i-j),j=-1,i)/))+1
            !            write(*,*) 'levels%steps', levels(-1:MaxLevel)%step
            !            IF (MPI_ID == 0) write(*,*) i, steps_remaining(i), (levels(0:i)%steps-levels(0:i)%step), 2**(/((i-j),j=0,i)/)
         END DO

         ! Determine projected coarser work that needs to be done per level n step
         MyWorkLoads(1) = SUM(MeanWorkLoadByLevelPerStep(-1:n)*steps_remaining(-1:n)-WorkDoneByLevel(-1:n)) / steps_remaining(n)


         IF (lDistVerbose) THEN
            DO i=0, MPI_NP
               IF (MPI_ID == i) THEN
                  write(*,*) '**********************************************************'
                  write(*,'(A,I4,A,E13.5,A,E13.5)') 'Processor ', MPI_ID, ' has a coarser work load of ', MyWorkLoads(1), ' and a local child cost of ', LocalCost
                  write(*,'(A50, 10E13.5)') 'WorkLoadByLevelPerStep', MeanWorkLoadByLevelPerStep
                  write(*,'(A50, 10I4)') 'Steps Remaining', steps_remaining(-1:MaxLevel)
                  write(*,'(A50, 10E13.5)') 'Work Done By Level', WorkDoneByLevel(-1:n)
                  write(*,'(A50, 10E13.5)') 'Remaining projected work load by level', MeanWorkLoadByLevelPerStep(-1:n)*steps_remaining(-1:n) - WorkDoneByLevel(-1:n)
                  write(*,'(A50, 10E13.5)') 'Remaining projected work load to balance this step', (MeanWorkLoadByLevelPerStep(-1:n)*steps_remaining(-1:n) - WorkDoneByLevel(-1:n))/steps_remaining(n)

               END IF
               CALL MPI_BARRIER(MPI_COMM_WORLD, iERr)
            END DO
         END IF

      END IF
      MyWorkLoads(2) = LocalCost 
      MyWorkLoads(3)=merge(1d0,0d0,lRequestRestart)
      CALL StartTimer(iBarrier, n)
      CALL MPI_ALLGather(MyWorkLoads, 3, MPI_DOUBLE_PRECISION, WorkLoadsByProcessor, 3, MPI_DOUBLE_PRECISION, levels(n)%MPI_COMM, iErr)
      CALL StopTimer(iBarrier, n)

      ! Adjust Finest level depending on whether or not any processor created children
      FinestLevel=merge(n+1, min(n, FinestLevel), SUM(WorkLoadsByProcessor(2,:)) > 0 .AND. n <= FinestLevel)

      ! Determine if any processor has requested restarts
      RestartStep=ANY(WorkLoadsByProcessor(3,:) == 1d0)

      !Calculate total new work load, total coarser work load, and update mean workloadbylevel for child level
      TotalNewWorkLoad=SUM(WorkLoadsByProcessor(2,:))
      CoarserWorkLoad=SUM(WorkLoadsByProcessor(1,:))

      MeanWorkLoadByLevel(n+1)=TotalNewWorkLoad/REAL(MPI_NP)*splitfactor(n) !not very accurate as it does not account for splitting


      !now we have \eta_l^p and c_l^p and we want to balance \eta_l^p+childsteps_remaining*c_l^p

      !Calculate total new projected remaining workload for child level and all coarser levels and the average
      TotalWorkLoad=CoarserWorkLoad+TotalNewWorkLoad 
      MeanWorkLoad=TotalWorkLoad/REAL(MPI_NP, 8)

      IF (iThreaded == NON_THREADED .OR. n < -1) THEN !Don't use finer level excesses to adjust splitting since approach is non threaded
         TotalExcessPerProc=0d0
      ELSE
         TotalExcessPerProc=sum(MeanWorkLoadByLevel(n+2:MaxLevel)*steps_remaining(n+1:MaxLevel-1))/steps_remaining(n)  !How much finer work can we assume will be available to balance
      END IF


      DesiredExcessPerProc=LevelBalance(1)*TotalExcessPerProc !Adjust split points based on degree of global breadth first approach
      SplitExcessPerProc=(LevelBalance(2)-LevelBalance(1))*TotalExcessPerProc ! Determine excess per processor allowed above the split point

      ! Calculate partition points for new load
      ALLOCATE(ChildLoad(0:MPI_NP), NewLoad(0:MPI_NP))



      IF (LevelBalance(2) == 0d0 .OR. iThreaded == NON_THREADED .OR. n < -1) THEN
         ChildLoad(0)=0
         NewLoad(0) = 0 
         DO i=1, MPI_NP
            NewLoad(i)=NewLoad(i-1)+MeanWorkLoad-WorkLoadsByProcessor(1,i)  !Split points for omega^{n+1}
            ChildLoad(i)=ChildLoad(i-1)+WorkLoadsByProcessor(2,i)       !Split points for c_p
         END DO
      ELSE
         ChildLoad(0)=0
         NewLoad(0) = 0 
         temp=0
         DO i=1, MPI_NP
            NewLoad(i)=NewLoad(i-1)+max(MeanWorkLoad-WorkLoadsByProcessor(1,i)+DesiredExcessPerProc,0d0)     !Split points for omega^{n+1}
            temp=temp+MeanWorkLoad-WorkLoadsByProcessor(1,i)+DesiredExcessPerProc     !Split points for omega^{n+1}
            ChildLoad(i)=ChildLoad(i-1)+WorkLoadsByProcessor(2,i)      !Split points for c_p
         END DO
         NewLoad=NewLoad*temp/NewLoad(MPI_NP)

         IF (lDistVerbose .AND. MPI_ID == 0) THEN
            write(*,*) 'loads for level', n
            write(*,'(A,10E15.6)') 'Coarser Work Loads  = ',  WorkLoadsByProcessor(1,:)
            write(*,'(A,10E15.6)') 'Predicted Finer Work Loads  = ',  sum(MeanWorkLoadByLevel(n+2:MaxLevel)*steps_remaining(n+1:MaxLevel-1))/steps_remaining(n)
            write(*,'(A,10E15.6)') 'New Work Loads = ', WorkLoadsByProcessor(2,:)
            write(*,'(A,10E15.6)') 'New Load Partitions = ', NewLoad(:)
            write(*,'(A,10E15.6)') 'Current Load Partitions = ', ChildLoad(:)
         END IF
      END IF


      ALLOCATE(parentproc(MPI_NP), childproc(MPI_NP), childtimes(MPI_NP), maxchildtimes(MPI_NP))

      childproc=-1
      childtimes=0d0
      maxchildtimes=0d0

      !Do partioning and identify child & parent processors
      cp = 0
      pp = 0
      DO i=1, MPI_NP
         parentoverlap=min(NewLoad(MPI_ID+1), ChildLoad(i))-max(NewLoad(MPI_ID), ChildLoad(i-1))
         childoverlap=min(ChildLoad(MPI_ID+1), NewLoad(i))-max(ChildLoad(MPI_ID), NewLoad(i-1))

         IF (childoverlap > 0) THEN               
            cp=cp+1
            childproc(cp)=i-1
            childtimes(cp)=childoverlap
            maxchildtimes(cp)=childoverlap*(1d0+SplitExcessPerProc/(NewLoad(i)-NewLoad(i-1))) !Distribute excess per proc in proportion to the relative work load desired to be assigned to that child proc
         END IF
         IF (parentoverlap > 0) THEN
            pp=pp+1
            parentproc(pp)=i-1
         END IF
      END DO



      IF (lDistVerbose) THEN
         DO i=0, MPI_NP
            IF (MPI_ID == i) THEN
               write(*,*) 'parent procs for processor ', MPI_ID, ' = ', parentproc(1:pp)
               write(*,*) 'child procs and times for processor ', MPI_ID, ' = ', childproc(1:cp), childtimes(1:cp)
            END IF
            CALL MPI_BARRIER(MPI_COMM_WORLD, iErr)
         END DO

         IF (MPI_ID == 0) THEN
            write(*,'(A,8E13.2)') 'New Load =', NewLoad
            write(*,'(A,8E13.2)') 'Current Load =', ChildLoad
            ALLOCATE(parentoverlaps(MPI_NP, MPI_NP), childoverlaps(MPI_NP, MPI_NP))
            DO i=1, MPI_NP
               DO j=1, MPI_NP
                  parentoverlaps(i,j)=min(NewLoad(j), ChildLoad(i))-max(NewLoad(j-1), ChildLoad(i-1))
                  childoverlaps(i,j)=min(ChildLoad(j), NewLoad(i))-max(ChildLoad(j-1), NewLoad(i-1))
               END DO
            END DO
            !            write(*,*) 'parentoverlaps'
            !            write(*,'(2E13.2)') parentoverlaps
            !            write(*,*) 'childoverlaps'
            !            write(*,'(2E13.2)') childoverlaps
            DEALLOCATE(parentoverlaps, childoverlaps)
         END IF
      END IF

      !Rebuild ParentProcs and ChildProcs lists
      IF (ASSOCIATED(ParentProcs(n+1)%p)) DEALLOCATE(ParentProcs(n+1)%p)
      IF (ASSOCIATED(ChildProcs(n)%p)) DEALLOCATE(ChildProcs(n)%p)
      IF (pp > 0) THEN
         ALLOCATE(ParentProcs(n+1)%p(pp))
         ParentProcs(n+1)%p=parentproc(1:pp)
      END IF
      IF (cp > 0) THEN
         ALLOCATE(ChildProcs(n)%p(cp))
         ChildProcs(n)%p=childproc(1:cp)
      END IF


      DEALLOCATE(ChildLoad, NewLoad, WorkLoadsByProcessor, parentproc, childproc)
      IF (ALLOCATED(steps_remaining)) DEALLOCATE(steps_remaining)



      ! Now we know how much workload we would like to assign to each child processor as well as the maximum we can assign before splitting it is time to distribute the workload.
      ! We can either do an ordered distribution where processor assignments follow a strict hilbert ordering
      ! Or we can do a local knapsack type algorithm     

      ! We have a list of nodes - and their children to distribute
      ! We have a list of available childprocs 
      ! and we have a list of desired work load allocations (childtimes) as well as a maximum work load allocation (maxchildtimes)


      LocalCost=0d0 !Recalculate local costs for split children

      IF (lKnapSack) THEN
         ! Assign heaviest child nodes first to best matching bin (one with the desired workload closest to actual and with a maximum work load that is greater)
         ! If none is available then we will have to split grid into at least two pieces...
         ! So then check for best matcing pairs that will accomplish the same
         ! Then triplets... and so on....
         ! Then split grids based on desired weights
         ! Then move to next smallest grid and repeat...

         write(*,*) 'Local Knapsack algorithm not yet implemented'

      ELSE
         ! Go through children in order and processors in order assigning children and splitting if a child assignemnt will overload the child processor

         !         ALLOCATE(splitweight(MPI_NP), splitproc(MPI_NP), maxsplitweight(MPI_NP))   
         nextproc = 1
         nodelist=>Nodes(n)%p      

         DO WHILE (associated(nodelist))
            node=>nodelist%self
            children=>node%children             !backup current chidllist to later destroy
            childlist=>node%children            !Child list pointer
            NULLIFY(node%children)              !Destroy current nodes child list so it can be rebuilt
            NULLIFY(node%lastchild)

            DO WHILE (ASSOCIATED(childlist))    
               child=>childlist%self
               ChildCostRemaining=max(GetChildCosts(childlist%self%box%mGlobal, n+1),1e-20)
               IF (cp == 0) THEN !Random check that should not be needed... unless profile.data gives negative workloads...
                  write(*,*) MPI_ID, 'have child but no child processors', childlist%self%box%mGlobal, 'perhaps profile.data has negative entries?'
                  STOP
               END IF
               i=nextproc
               j=nextproc
               DO WHILE (SUM(MaxChildTimes(i:j)) < ChildCostRemaining)
                  IF (j == cp) EXIT
                  j=j+1
               END DO
               IF (lDistVerbose) write(*,*) MPI_ID, ' splitting into ', j-i+1 , ' pieces'

               ! Now we know the minimum number of subgrids required
               ALLOCATE(splitproc(1:j-i+1))
               ALLOCATE(splitweight(1:j-i+1))

               splitproc=ChildProcs(n)%p(i:j)

               IF (SUM(ChildTimes(i:j)) >= ChildCostRemaining) THEN !Now let's use the desired times for i:j-1 and adjust j to match                  
                  splitweight(1:j-i)=ChildTimes(i:j-1)
                  splitweight(j-i+1)=ChildCostRemaining-SUM(ChildTimes(i:j-1))
                  nextproc=j !increase i to the next processor with any remaining time
                  IF (lDistVerbose) write(*,*) MPI_ID, 'shrinking last grid only'
               ELSE

                  !Now we want to adjust the splits so they are closer to the desired values instead of the maximum values
                  !Each processor has a range from x to X and we know that X+Y+Z > c'
                  !We want x'+y'+z'=c' where x < x' < X and y < y' < Y and z<z'<Z
                  !Take x+y+z=c and X+Y+Z=C and calculate alpha = (c'-c)/(C-c) where 0 < alpha < 1
                  !Then x'=x+alpha*(X-x), y'=y+alpha*(Y-y) and z'=z+alpha*(Z-z)

                  IF (SUM(MaxChildTimes(i:j)-ChildTimes(i:j)) < 1e-10) THEN
                     alpha=1d0
                  ELSE
                     alpha=(ChildCostRemaining-SUM(ChildTimes(i:j)))/SUM(MaxChildTimes(i:j)-ChildTimes(i:j))
                  END IF
                  IF (lDistVerbose) write(*,*) MPI_ID, 'stretching all grids', alpha
                  splitweight=ChildTimes(i:j)+(MaxChildTimes(i:j)-ChildTimes(i:j))*alpha                  
                  nextproc=min(j+1,cp)
               END IF
               ChildTimes(i:j)=ChildTimes(i:j)-splitweight
               MaxChildTimes(i:j)=MaxChildTimes(i:j)-splitweight                 
               IF (ChildTimes(j) <= 0d0) nextproc=min(j+1,cp)
               !               write(*,*) MPI_ID, 'i,j,i-j+1', i, j, j-i+1
               j=j-i+1           
               IF (.NOT. lsplit) THEN                         ! We can't split child
                  ALLOCATE(newgrids(3,2,1))
                  newgrids(:,:,1)=child%box%mGlobal
                  splitproc(1)=ChildProcs(n)%p(i-1+sum(maxloc(splitweight(1:j)))) !Assign child to childproc with the maximum availability
                  j=1
                  !                  write(*,*) MPI_ID, 'not splitting', splitweight(1:j), splitproc(1)
               ELSE
                  ALLOCATE(newgrids(3,2,j))                   ! Calculate new set of smaller grids that have the correct weights
                  !                 write(*,*) 'splitting', child%box%mGlobal
                  DO 
                     newgrids=0
                     IF (lDistVerbose) write(*,*) MPI_ID, ' split weights ', splitweight(1:j)
                     CALL HilbertSplit(leveldown(child%box%mGlobal, n+1), splitweight(1:j), newgrids, (/1,j/), n)                         
                     !                     write(*,'(4I6)') newgrids(1:2,1:2,1:j)
                     l=j
                     m=1
                     DO WHILE (m <= l)
                        IF (newgrids(1,1,m) == 0) THEN
                           !                          write(*,*) MPI_ID, ' removing ', m, j, l
                           l=l-1
                           IF (l == 0) THEN
                              write(*,*) 'huh'
                              STOP
                           END IF
                           newgrids(:,:,m:l)=newgrids(:,:,m+1:l+1)
                           splitweight(m:l)=splitweight(m+1:l+1)
                           splitproc(m:l)=splitproc(m+1:l+1)
                           m=m-1
                        END IF
                        m=m+1
                     END DO
                     IF (l == j) EXIT !No grids to remove from splitting
                     !                    write(*,*) MPI_ID, 'reducing number of child grids to ', l
                     j=l !reduce number of grids to split
                  END DO
                  DO m=1,j
                     newgrids(:,:,m)=levelup(newgrids(:,:,m), n)
                  END DO
               END IF

               DO l=1,j
                  !Do some sanity checks
                  IF (ALL(newgrids(:,:,l)==0)) CYCLE !Unable to split into the desired number of pieces so remove unused procs and adjust weights

                  !Just checking that hilbert split didn't return any illegal new grids
                  IF (ANY(newgrids(:,1,l) < child%box%mGlobal(:,1)) .OR. ANY(newgrids(:,2,l) > child%box%mGlobal(:,2))) THEN
                     write(*,*) l, newgrids(:,:,l), child%box%mGlobal
                     write(*,*) "error in distribution", lsplit
                     write(*,'(A,6I4)') "childgrid=", child%box%mGlobal
                     write(*,'(A,6I4)') "splitgrid=", newgrids(:,:,l)
                     STOP
                  END IF
                  IF (ANY(newgrids(1:nDim,1,l) > newgrids(1:nDim,2,l))) THEN
                     write(*,*) 'illegal grid'
                     write(*,*) newgrids(:,:,l)
                     STOP
                  END IF
                  if (splitproc(l) < 0 .OR. splitproc(l) > MPI_NP-1) THEN
                     write(*,*) 'illegal target processor', splitproc(l)
                     STOP
                  END if

                  !Create child
                  NULLIFY(child_box)
                  CALL CreateNodeBox(newgrids(:,:,l), child_box, splitproc(l))
                  IF (lDistVerbose) write(*,*) MPI_ID, ' creating child ', child_box
                  LocalCost=LocalCost+max(GetChildCosts(child_box%mGlobal,n+1), 1e-20)
                  NULLIFY(newchild)
                  CALL AddNode(n+1, child_box, newchild)
                  CALL AddParent(newchild, node)
                  CALL AddChild(node, newchild)
                  CALL DestroyNodeBox(child_box)

               END DO

               DEALLOCATE(newgrids)
               DEALLOCATE(splitproc, splitweight)
               childlist=>childlist%next
            END DO



            CALL DestroyNodeList(children)         !Destroy old childlist


            !Setup childgrids list like newsubgrids
            IF (n >= 0) THEN
               nchildren=NodeCount(node%children)
               IF (nchildren > 0) THEN
                  ALLOCATE(childgrids(3,2,nChildren))
                  children=>node%children
                  j=0
                  DO WHILE (ASSOCIATED(children))
                     j=j+1
                     childgrids(:,:,j)=children%self%box%mGlobal
                     children=>children%next
                  END DO
               ELSE
                  NULLIFY(childgrids)
               END IF
               CALL AllocChildFixups(node%info, childgrids)
               IF (ASSOCIATED(childgrids)) DEALLOCATE(childgrids)
            END IF
            nodelist=>nodelist%next         
         END DO

         !         IF (lSplit) THEN
         !            CALL MPI_ALLREDUCE(LocalCost, TotalLocalCost, 1, MPI_DOUBLE_PRECISION, MPI_SUM, levels(n)%MPI_COMM, ierr)

         ! Since MeanWorkLoadByLevel(n+1) = TotalLocalCostEst*splitfactor(n)/MPI_NP
         ! And we want splitfactor(n)=TotalLocalCost/TotalLocalCostEst
         !            splitfactor(n) = splitfactor(n)*(TotalLocalCost/REAL(MPI_NP))/MeanWorkLoadByLevel(n+1) 
         !            IF (MPI_ID == 0) write(*,*) 'Splitfactor for level ', n, ' = ', splitfactor(n)
         ! Now we can correct the workload
         !            MeanWorkLoadByLevel(n+1)=TotalLocalCost/MPI_NP
         !         END IF


      END IF

      DEALLOCATE(childtimes, maxchildtimes)

      CALL StopTimer(iDistributeChildrens, n)
   END SUBROUTINE DistributeChildrens


   !> Returns the hilbert value of a box
   !! @param mGlobal box bounds
   !! @param level box level
   FUNCTION HilbertValue(mGlobal, level)
      REAL(KIND=qPREC) :: HilbertValue
      INTEGER :: i,j,k,n, level,tempi,tempj,tempk
      INTEGER, DIMENSION(:,:) :: mGlobal
      INTEGER :: indices(3,2), sum_mGlobal(3)
      INTEGER :: h
      HilbertValue=0
      n=Rootn+max(level,0)
      sum_mGlobal(1:nDim)=sum(mGlobal(1:nDim,:),2)
      DO i=1,nDim
         IF (modulo(sum_mGlobal(i),2) == 0) THEN
            indices(i,1)=floor(half*sum_mGlobal(i))
            indices(i,2)=indices(i,1)+1
         ELSE
            indices(i,:)=sum_mGlobal(i)/2
         END IF
      END DO
      IF (nDim == 2) THEN
         DO i=indices(1,1), indices(1,2) !, ncells(1)
            DO j=indices(2,1), indices(2,2) !, ncells(1)
               tempi=(i-1)*di
               tempj=(j-1)*dj
               CALL closedformhilbert2d(n,tempi,tempj,h)
               HilbertValue=HilbertValue+h
            END DO
         END DO
      ELSEIF (nDim == 3) THEN
         DO i=indices(1,1), indices(1,2) !, ncells(1)
            DO j=indices(2,1), indices(2,2) !, ncells(1)
               DO k=indices(3,1), indices(3,2)
                  tempi=(i-1)*di
                  tempj=(j-1)*dj
                  tempk=(k-1)*dk
                  CALL closedformhilbert3d(n,tempi,tempj,tempk,h)
                  HilbertValue=HilbertValue+h         
               END DO
            END DO
         END DO
      END IF
      HilbertValue=HilbertValue/REAL(product(indices(1:nDim,2)-indices(1:nDim,1)+1))
   END FUNCTION HilbertValue

   !> Sorts childgrids by hilbert value
   !! @param childgrids array of child bounds (in parent space)
   !! @param level level of parent
   SUBROUTINE HilbertSort(childgrids,level)
      INTEGER, DIMENSION(:,:,:), POINTER, INTENT(INOUT) :: childgrids
      INTEGER, DIMENSION(:,:,:), POINTER :: sortedchildgrids
      REAL(KIND=qPREC), DIMENSION(:), POINTER :: childgridhilbertvalues
      INTEGER, INTENT(IN) :: level
      INTEGER :: n,i,index
      n=size(childgrids,3)
      IF (n == 1) RETURN
      !      write(*,*) "HilbertSort", n
      ALLOCATE(sortedchildgrids(3,2,n), childgridhilbertvalues(n))
      DO i=1,n
         IF (ALL(childgrids(:,:,i) == 0)) THEN
            childgridhilbertvalues(i)=-1
         ELSE
            childgridhilbertvalues(i)=HilbertValue(childgrids(:,:,i),level)
         END IF
      END DO
      !      write(*,*) childgridhilbertvalues
      DO i=1,n
         index=minloc(childgridhilbertvalues,DIM=1)
         sortedchildgrids(:,:,i)=childgrids(:,:,index)
         childgridhilbertvalues(index)=huge(1.0)
      END DO
      DEALLOCATE(childgrids)
      childgrids=>sortedchildgrids
      NULLIFY(sortedchildgrids)
      DEALLOCATE(childgridhilbertvalues)
      NULLIFY(childgridhilbertvalues)
   END SUBROUTINE HilbertSort


   !> Splits childgrid into pieces based on size and number of weights.  
   !! @param childgrid indices of grid to split
   !! @param weights desired weights to split into
   !! @param newgrids grids created by splitting childgrid
   !! @param indx current range of newgrids, and weights to operate on
   !! @param level parent level
   !! @details Pieces are created to match weights and should represent grids that are also roughly in hilbert order
   RECURSIVE SUBROUTINE HilbertSplit(childgrid, weights, newgrids,indx,level)
      INTEGER, DIMENSION(:,:), INTENT(IN) :: childgrid
      REAL(KIND=qPREC), DIMENSION(:), INTENT(IN) :: weights
      REAL(KIND=qPREC), DIMENSION(:), ALLOCATABLE :: accweight
      INTEGER, DIMENSION(:,:,:), POINTER, INTENT(OUT) :: newgrids
      REAL(KIND=qPREC) :: leftweight, rightweight, subgridweights(2),totalweight,HilbertDiff(3,2),HilbertLeft,HilbertRight,cost(3,2), effective_cost(3,2)
      INTEGER, DIMENSION(3,2) :: leftmGlobal,rightmGlobal,subgridleft,subgridright
      INTEGER, DIMENSION(2) :: indx
      INTEGER ::ngrids,i1,i,j,splitpoint(3,2),level, loc(2)
      !      IF (SIZE(weights) == 0) THEN
      !         write(*,*) 'weights are zero sized'
      !         STOP
      !      END IF
      IF (any(weights==0)) THEN
         write(*,*) "weights are zero", weights, MPI_ID
         STOP
      END IF
      ngrids=indx(2)-indx(1)+1 
      !          write(*,'(A,6I4,20E13.2)') "HilbertSplit", childgrid, weights
      IF (indx(2) < indx(1)) THEN
         write(*,*) "err in HilbertSplit", indx
         write(*,*) childgrid
         write(*,*) weights
         write(*,*) level
         STOP
      END IF
      IF (ANY(childgrid(1:nDim,2) < childgrid(1:nDim,1))) THEN
         newgrids(:,:,indx(1):indx(2))=0
         RETURN
      ELSE
         IF (ngrids == 1) THEN
            newgrids(:,:,indx(1))=childgrid
         ELSE

            totalweight=sum(weights(indx(1):indx(2)))
            IF (ngrids==2) THEN !get processor split point
               i1=indx(1)
               leftweight=weights(indx(1))/totalweight
            ELSE
               ALLOCATE(accweight(indx(1):indx(2)))

               totalweight=sum(weights(indx(1):indx(2)))
               accweight(indx(1))=weights(indx(1))
               DO i=indx(1)+1,indx(2)
                  accweight(i)=accweight(i-1)+weights(i)
               END DO
               i1=indx(1)-1+sum(minloc(abs(accweight/totalweight-half))) !gets split point that is closest to half
               leftweight=accweight(i1)/totalweight !left weight
               DEALLOCATE(accweight)
            END IF
            rightweight=1d0-leftweight !right weight

            !Now we know where to split the processors
            !and the relative sizes of the left and right grids
            !Now we just need to decide along which direction to split and which half is the new left and the new right

            leftmGlobal=childgrid
            rightmGlobal=childgrid
            subgridweights=(/leftweight,rightweight/)

            DO i=1,nDim
               DO j=1,2
                  splitpoint(i,j)=childgrid(i,1)-1+nint((childgrid(i,2)-childgrid(i,1)+1)*subgridweights(j))
                  IF (splitpoint(i,j) >= childgrid(i,1)-1+MinimumGridPoints .AND. splitpoint(i,j) <= childgrid(i,2)-MinimumGridPoints) THEN
                     leftmGlobal(i,2)=splitpoint(i,j)
                     rightmGlobal(i,1)=splitpoint(i,j)+1
                     HilbertLeft=HilbertValue(leftmGlobal,level)
                     HilbertRight=HilbertValue(rightmGlobal,level)
                     HilbertDiff(i,j)=(-1)**(j-1)*(HilbertRight-HilbertLeft)

                  ELSE IF (childgrid(i,2)-childgrid(i,1)+1 < 2*MinimumGridPoints) THEN !Can't split
                     splitpoint(i,j)=childgrid(i,1)-1+(childgrid(i,2)-childgrid(i,1)+1)*nint(subgridweights(j))
                     leftmGlobal(i,2)=splitpoint(i,j)
                     rightmGlobal(i,1)=splitpoint(i,j)+1
                     HilbertDiff(i,j)=0 !We don't want to favor the additional load imbalancing                                         

                  ELSE !Need to round splitpoint to multiple of MinimumGridPoints
                     IF (splitpoint(i,j) < childgrid(i,1)-1+MinimumGridPoints) THEN 
                        splitpoint(i,j)=childgrid(i,1)-1+MinimumGridPoints*min(nint((childgrid(i,2)-childgrid(i,1)+1) * &
                             subgridweights(j)/MinimumGridPoints),1)

                     ELSE !splitpoint > childgrid(i,2)-MinimumGridPoints
                        splitpoint(i,j)=childgrid(i,2)-MinimumGridPoints*min(nint((childgrid(i,2)-childgrid(i,1)+1) * &
                             subgridweights(3-j)/MinimumGridPoints),1)
                     END IF
                     leftmGlobal(i,2)=splitpoint(i,j)
                     rightmGlobal(i,1)=splitpoint(i,j)+1                     
                     HilbertLeft=HilbertValue(leftmGlobal,level)
                     HilbertRight=HilbertValue(rightmGlobal,level)
                     HilbertDiff(i,j)=(-1)**(j-1)*(HilbertRight-HilbertLeft)
                  END IF


                  IF (ngrids == 2) THEN
                     IF (j==1) THEN
                        cost(i,j)=max(ChildAdvanceCost(leftmGlobal, level)/leftweight, ChildAdvanceCost(rightmGlobal,level)/rightweight)
                     ELSE
                        cost(i,j)=max(ChildAdvanceCost(leftmGlobal, level)/rightweight, ChildAdvanceCost(rightmGlobal, level)/leftweight)
                     END IF
                  ELSE
                     IF (j == 1) THEN
                        cost(i,j)=max(ChildAdvanceCost(nint((leftmGlobal)/REAL(i1-indx(1)+1)**(1d0/nDim)),level)/leftweight, ChildAdvanceCost(nint((rightmGlobal)/REAL(indx(2)-i1)**(1d0/nDim)),level)/rightweight)
                     ELSE
                        cost(i,j)=max(ChildAdvanceCost(nint((rightmGlobal)/REAL(i1-indx(1)+1)**(1d0/nDim)),level)/rightweight, ChildAdvanceCost(nint((leftmGlobal)/REAL(indx(2)-i1)**(1d0/nDim)),level)/leftweight)
                     END IF
                  END IF


               END DO
               leftmGlobal(i,2)=childgrid(i,2)
               rightmGlobal(i,1)=childgrid(i,1)

            END DO


            !Rescale HilbertDiffs to go from 0 to +- 1
            HilbertDiff(1:nDim,:)=HilbertDiff(1:nDim,:) / max(maxval(abs(HilbertDiff(1:nDim,:))), 1d-10)
            cost(1:nDim,:)=cost(1:nDim,:)/max(minval(cost(1:nDim,:)),1d-10)
            effective_cost(1:nDim,:)=cost(1:nDim,:)-.00001*HilbertDiff(1:nDim,:)

            !            write(*,'(A,6E15.6)') 'effective_cost=', effective_cost
            !            write(*,'(A,6E15.6)') 'cost =', cost
            !            write(*,'(A,6I15)') 'splitpoints=', splitpoint

            loc=minloc(effective_cost(1:nDim,:))
            i=loc(1)
            j=loc(2)

            IF (splitpoint(i,j) < childgrid(i,1)-1 .OR. splitpoint(i,j) > childgrid(i,2)) THEN
               write(*,*) 'illegal split point', splitpoint, effective_cost, i, j, childgrid
               STOP
            END IF

            leftmGlobal(i,2)=splitpoint(i,j)
            rightmGlobal(i,1)=splitpoint(i,j)+1           

            IF (j==1) THEN            
               subgridleft=leftmGlobal
               subgridright=rightmGlobal
            ELSE
               subgridleft=rightmGlobal
               subgridright=leftmGlobal
            END IF
            !            write(*,'(A,6I4,A,6I4,A,6I4,A,F12.2)') "Split ", childgrid, " into ",subgridleft, " and ", subgridright, "with HDiff=", HilbertDiff(i,j)

            !            write(*,*) indx(1),i1,indx(2)
            IF (ANY(subgridleft(1:ndim,2) < subgridleft(1:nDim,1))) THEN

               IF (leftweight == rightweight) THEN
                  CALL HilbertSplit(subgridright,weights,newgrids,(/indx(1),i1/),level)
                  CALL HilbertSplit(subgridleft,weights,newgrids,(/i1+1,indx(2)/),level)
               ELSE
                  CALL HilbertSplit(subgridright,weights,newgrids,(/i1+1,indx(2)/),level)
                  CALL HilbertSplit(subgridleft,weights,newgrids,(/indx(1),i1/),level)
               END IF
            ELSE
               CALL HilbertSplit(subgridright,weights,newgrids,(/i1+1,indx(2)/),level)
               CALL HilbertSplit(subgridleft,weights,newgrids,(/indx(1),i1/),level)
            END IF
         END IF
      END IF
   END SUBROUTINE HilbertSplit



   !> Clears the Parent Processor lists for a level
   !! @param level level
   SUBROUTINE ClearParentProcs(level)
      INTEGER :: level
      RETURN
      IF (ASSOCIATED(ParentProcs(level)%p)) THEN
         DEALLOCATE(ParentProcs(level)%p)
         NULLIFY(ParentProcs(level)%p)
      END IF

   END SUBROUTINE ClearParentProcs

END MODULE DistributionControl
