End timer
Type | Intent | Optional | Attributes | Name | ||
---|---|---|---|---|---|---|
integer, | intent(in) | :: | handle |
SUBROUTINE timestop_handler(handle)
!! End timer
INTEGER, INTENT(in) :: handle
CHARACTER(LEN=400) :: line, mystring
CHARACTER(LEN=60) :: sformat
INTEGER :: routine_id, stack_size
INTEGER(KIND=int_8) :: cpumem, gpumem_free, gpumem_total
INTEGER, DIMENSION(2) :: routine_tuple
REAL(KIND=dp) :: en_elapsed, en_now, wt_elapsed, wt_now
TYPE(call_stat_type), POINTER :: c_stat
TYPE(callstack_entry_type) :: cs_entry, prev_cs_entry
TYPE(routine_stat_type), POINTER :: prev_stat, r_stat
TYPE(timer_env_type), POINTER :: timer_env
routine_id = handle
!$OMP MASTER
#if defined( __CUDA_PROFILING )
CALL cuda_nvtx_range_pop()
#endif
#if defined( __HIP_PROFILING )
CALL roctxRangePop()
#endif
timer_env => list_peek(timers_stack)
cs_entry = list_pop(timer_env%callstack)
r_stat => list_get(timer_env%routine_stats, cs_entry%routine_id)
IF (handle /= cs_entry%routine_id) THEN
PRINT *, "list_size(timer_env%callstack) ", list_size(timer_env%callstack), &
" list_size(timers_stack) ", list_size(timers_stack), &
" got handle ", handle, " expected routineid ", cs_entry%routine_id
DBCSR_ABORT('mismatched timestop '//TRIM(r_stat%routineN)//' in routine timestop')
END IF
wt_elapsed = 0
en_elapsed = 0
! Take timings only when the start time is >=0, i.e. the timings_level is appropriated
IF (cs_entry%walltime_start .GE. 0) THEN
wt_now = m_walltime()
en_now = m_energy()
! add the elapsed time for this timeset/timestop to the time accumulator
wt_elapsed = wt_now - cs_entry%walltime_start
en_elapsed = en_now - cs_entry%energy_start
END IF
r_stat%active_calls = r_stat%active_calls - 1
! if we're the last instance in the stack, we do the accounting of the total time
IF (r_stat%active_calls == 0) THEN
r_stat%incl_walltime_accu = r_stat%incl_walltime_accu + wt_elapsed
r_stat%incl_energy_accu = r_stat%incl_energy_accu + en_elapsed
END IF
! exclusive time we always sum, since children will correct this time with their total time
r_stat%excl_walltime_accu = r_stat%excl_walltime_accu + wt_elapsed
r_stat%excl_energy_accu = r_stat%excl_energy_accu + en_elapsed
stack_size = list_size(timer_env%callstack)
IF (stack_size > 0) THEN
prev_cs_entry = list_peek(timer_env%callstack)
prev_stat => list_get(timer_env%routine_stats, prev_cs_entry%routine_id)
! we fixup the clock of the caller
prev_stat%excl_walltime_accu = prev_stat%excl_walltime_accu - wt_elapsed
prev_stat%excl_energy_accu = prev_stat%excl_energy_accu - en_elapsed
!update callgraph
routine_tuple = (/prev_cs_entry%routine_id, routine_id/)
c_stat => dict_get(timer_env%callgraph, routine_tuple, default_value=Null(c_stat))
IF (.NOT. ASSOCIATED(c_stat)) THEN
ALLOCATE (c_stat)
c_stat%total_calls = 0
c_stat%incl_walltime_accu = 0.0_dp
c_stat%incl_energy_accu = 0.0_dp
CALL dict_set(timer_env%callgraph, routine_tuple, c_stat)
END IF
c_stat%total_calls = c_stat%total_calls + 1
c_stat%incl_walltime_accu = c_stat%incl_walltime_accu + wt_elapsed
c_stat%incl_energy_accu = c_stat%incl_energy_accu + en_elapsed
END IF
!..if debug mode echo the subroutine name
IF ((timer_env%trace_all .OR. r_stat%trace) .AND. &
(r_stat%total_calls < timer_env%trace_max)) THEN
WRITE (sformat, *) "(A,A,", MAX(1, 3*stack_size - 4), "X,I4,1X,I6,1X,A,F12.3)"
WRITE (mystring, sformat) timer_env%trace_str, "<<", stack_size + 1, &
r_stat%total_calls, TRIM(r_stat%routineN), wt_elapsed
CALL acc_devmem_info(gpumem_free, gpumem_total)
CALL m_memory(cpumem)
WRITE (line, '(A,A,I0,A,A,I0,A)') TRIM(mystring), &
" Hostmem: ", (cpumem + 1024*1024 - 1)/(1024*1024), " MB", &
" GPUmem: ", (gpumem_total - gpumem_free)/(1024*1024), " MB"
WRITE (timer_env%trace_unit, *) TRIM(line)
CALL m_flush(timer_env%trace_unit)
END IF
!$OMP END MASTER
END SUBROUTINE timestop_handler