End timer
Type | Intent | Optional | Attributes | Name | ||
---|---|---|---|---|---|---|
integer, | intent(in) | :: | handle |
SUBROUTINE timestop_handler(handle) !! End timer INTEGER, INTENT(in) :: handle CHARACTER(LEN=400) :: line, mystring CHARACTER(LEN=60) :: sformat INTEGER :: routine_id, stack_size INTEGER(KIND=int_8) :: cpumem, gpumem_free, gpumem_total INTEGER, DIMENSION(2) :: routine_tuple REAL(KIND=dp) :: en_elapsed, en_now, wt_elapsed, wt_now TYPE(call_stat_type), POINTER :: c_stat TYPE(callstack_entry_type) :: cs_entry, prev_cs_entry TYPE(routine_stat_type), POINTER :: prev_stat, r_stat TYPE(timer_env_type), POINTER :: timer_env routine_id = handle !$OMP MASTER #if defined( __CUDA_PROFILING ) CALL cuda_nvtx_range_pop() #endif #if defined( __HIP_PROFILING ) CALL roctxRangePop() #endif timer_env => list_peek(timers_stack) cs_entry = list_pop(timer_env%callstack) r_stat => list_get(timer_env%routine_stats, cs_entry%routine_id) IF (handle /= cs_entry%routine_id) THEN PRINT *, "list_size(timer_env%callstack) ", list_size(timer_env%callstack), & " list_size(timers_stack) ", list_size(timers_stack), & " got handle ", handle, " expected routineid ", cs_entry%routine_id DBCSR_ABORT('mismatched timestop '//TRIM(r_stat%routineN)//' in routine timestop') END IF wt_elapsed = 0 en_elapsed = 0 ! Take timings only when the start time is >=0, i.e. the timings_level is appropriated IF (cs_entry%walltime_start .GE. 0) THEN wt_now = m_walltime() en_now = m_energy() ! add the elapsed time for this timeset/timestop to the time accumulator wt_elapsed = wt_now - cs_entry%walltime_start en_elapsed = en_now - cs_entry%energy_start END IF r_stat%active_calls = r_stat%active_calls - 1 ! if we're the last instance in the stack, we do the accounting of the total time IF (r_stat%active_calls == 0) THEN r_stat%incl_walltime_accu = r_stat%incl_walltime_accu + wt_elapsed r_stat%incl_energy_accu = r_stat%incl_energy_accu + en_elapsed END IF ! exclusive time we always sum, since children will correct this time with their total time r_stat%excl_walltime_accu = r_stat%excl_walltime_accu + wt_elapsed r_stat%excl_energy_accu = r_stat%excl_energy_accu + en_elapsed stack_size = list_size(timer_env%callstack) IF (stack_size > 0) THEN prev_cs_entry = list_peek(timer_env%callstack) prev_stat => list_get(timer_env%routine_stats, prev_cs_entry%routine_id) ! we fixup the clock of the caller prev_stat%excl_walltime_accu = prev_stat%excl_walltime_accu - wt_elapsed prev_stat%excl_energy_accu = prev_stat%excl_energy_accu - en_elapsed !update callgraph routine_tuple = (/prev_cs_entry%routine_id, routine_id/) c_stat => dict_get(timer_env%callgraph, routine_tuple, default_value=Null(c_stat)) IF (.NOT. ASSOCIATED(c_stat)) THEN ALLOCATE (c_stat) c_stat%total_calls = 0 c_stat%incl_walltime_accu = 0.0_dp c_stat%incl_energy_accu = 0.0_dp CALL dict_set(timer_env%callgraph, routine_tuple, c_stat) END IF c_stat%total_calls = c_stat%total_calls + 1 c_stat%incl_walltime_accu = c_stat%incl_walltime_accu + wt_elapsed c_stat%incl_energy_accu = c_stat%incl_energy_accu + en_elapsed END IF !..if debug mode echo the subroutine name IF ((timer_env%trace_all .OR. r_stat%trace) .AND. & (r_stat%total_calls < timer_env%trace_max)) THEN WRITE (sformat, *) "(A,A,", MAX(1, 3*stack_size - 4), "X,I4,1X,I6,1X,A,F12.3)" WRITE (mystring, sformat) timer_env%trace_str, "<<", stack_size + 1, & r_stat%total_calls, TRIM(r_stat%routineN), wt_elapsed CALL acc_devmem_info(gpumem_free, gpumem_total) CALL m_memory(cpumem) WRITE (line, '(A,A,I0,A,A,I0,A)') TRIM(mystring), & " Hostmem: ", (cpumem + 1024*1024 - 1)/(1024*1024), " MB", & " GPUmem: ", (gpumem_total - gpumem_free)/(1024*1024), " MB" WRITE (timer_env%trace_unit, *) TRIM(line) CALL m_flush(timer_env%trace_unit) END IF !$OMP END MASTER END SUBROUTINE timestop_handler