timestop_handler Subroutine

public subroutine timestop_handler(handle)

End timer

Arguments

TypeIntentOptionalAttributesName
integer, intent(in) :: handle

Contents

Source Code


Source Code

   SUBROUTINE timestop_handler(handle)
      !! End timer
      INTEGER, INTENT(in)                                :: handle

      CHARACTER(LEN=400)                                 :: line, mystring
      CHARACTER(LEN=60)                                  :: sformat
      INTEGER                                            :: routine_id, stack_size
      INTEGER(KIND=int_8)                                :: cpumem, gpumem_free, gpumem_total
      INTEGER, DIMENSION(2)                              :: routine_tuple
      REAL(KIND=dp)                                      :: en_elapsed, en_now, wt_elapsed, wt_now
      TYPE(call_stat_type), POINTER                      :: c_stat
      TYPE(callstack_entry_type)                         :: cs_entry, prev_cs_entry
      TYPE(routine_stat_type), POINTER                   :: prev_stat, r_stat
      TYPE(timer_env_type), POINTER                      :: timer_env

      routine_id = handle

!$OMP MASTER

#if defined( __CUDA_PROFILING )
      CALL cuda_nvtx_range_pop()
#endif
#if defined( __HIP_PROFILING )
      CALL roctxRangePop()
#endif

      timer_env => list_peek(timers_stack)
      cs_entry = list_pop(timer_env%callstack)
      r_stat => list_get(timer_env%routine_stats, cs_entry%routine_id)

      IF (handle /= cs_entry%routine_id) THEN
         PRINT *, "list_size(timer_env%callstack) ", list_size(timer_env%callstack), &
            " list_size(timers_stack) ", list_size(timers_stack), &
            " got handle ", handle, " expected routineid ", cs_entry%routine_id
         DBCSR_ABORT('mismatched timestop '//TRIM(r_stat%routineN)//' in routine timestop')
      END IF

      wt_elapsed = 0
      en_elapsed = 0
      ! Take timings only when the start time is >=0, i.e. the timings_level is appropriated
      IF (cs_entry%walltime_start .GE. 0) THEN
         wt_now = m_walltime()
         en_now = m_energy()
         ! add the elapsed time for this timeset/timestop to the time accumulator
         wt_elapsed = wt_now - cs_entry%walltime_start
         en_elapsed = en_now - cs_entry%energy_start
      END IF
      r_stat%active_calls = r_stat%active_calls - 1

      ! if we're the last instance in the stack, we do the accounting of the total time
      IF (r_stat%active_calls == 0) THEN
         r_stat%incl_walltime_accu = r_stat%incl_walltime_accu + wt_elapsed
         r_stat%incl_energy_accu = r_stat%incl_energy_accu + en_elapsed
      END IF

      ! exclusive time we always sum, since children will correct this time with their total time
      r_stat%excl_walltime_accu = r_stat%excl_walltime_accu + wt_elapsed
      r_stat%excl_energy_accu = r_stat%excl_energy_accu + en_elapsed

      stack_size = list_size(timer_env%callstack)
      IF (stack_size > 0) THEN
         prev_cs_entry = list_peek(timer_env%callstack)
         prev_stat => list_get(timer_env%routine_stats, prev_cs_entry%routine_id)
         ! we fixup the clock of the caller
         prev_stat%excl_walltime_accu = prev_stat%excl_walltime_accu - wt_elapsed
         prev_stat%excl_energy_accu = prev_stat%excl_energy_accu - en_elapsed

         !update callgraph
         routine_tuple = (/prev_cs_entry%routine_id, routine_id/)
         c_stat => dict_get(timer_env%callgraph, routine_tuple, default_value=Null(c_stat))
         IF (.NOT. ASSOCIATED(c_stat)) THEN
            ALLOCATE (c_stat)
            c_stat%total_calls = 0
            c_stat%incl_walltime_accu = 0.0_dp
            c_stat%incl_energy_accu = 0.0_dp
            CALL dict_set(timer_env%callgraph, routine_tuple, c_stat)
         END IF
         c_stat%total_calls = c_stat%total_calls + 1
         c_stat%incl_walltime_accu = c_stat%incl_walltime_accu + wt_elapsed
         c_stat%incl_energy_accu = c_stat%incl_energy_accu + en_elapsed
      END IF

      !..if debug mode echo the subroutine name
      IF ((timer_env%trace_all .OR. r_stat%trace) .AND. &
          (r_stat%total_calls < timer_env%trace_max)) THEN
         WRITE (sformat, *) "(A,A,", MAX(1, 3*stack_size - 4), "X,I4,1X,I6,1X,A,F12.3)"
         WRITE (mystring, sformat) timer_env%trace_str, "<<", stack_size + 1, &
            r_stat%total_calls, TRIM(r_stat%routineN), wt_elapsed
         CALL acc_devmem_info(gpumem_free, gpumem_total)
         CALL m_memory(cpumem)
         WRITE (line, '(A,A,I0,A,A,I0,A)') TRIM(mystring), &
            " Hostmem: ", (cpumem + 1024*1024 - 1)/(1024*1024), " MB", &
            " GPUmem: ", (gpumem_total - gpumem_free)/(1024*1024), " MB"
         WRITE (timer_env%trace_unit, *) TRIM(line)
         CALL m_flush(timer_env%trace_unit)
      END IF

!$OMP END MASTER

   END SUBROUTINE timestop_handler