Prints collected statistics
| Type | Intent | Optional | Attributes | | Name | |
|---|---|---|---|---|---|---|
| type(stats_type), | intent(inout) | | | :: | report | |
| integer, | intent(in) | | | :: | output_unit | |
SUBROUTINE stats_print_report(report, output_unit)
   !! Prints collected statistics
   !!
   !! Writes a table with the columns COUNTER, TOTAL, BLAS, SMM and ACC to
   !! `output_unit`: one "flops" row for every row of `report%num_mnk_stacks`
   !! except the first, followed by summary rows for flops, matmuls, processed
   !! stacks and the average stack size. Emits warnings through `dbcsr_warn`
   !! when a row is flagged in column 10 or when CPU-side counters are non-zero
   !! while `use_acc()` is true.
   TYPE(stats_type), INTENT(INOUT) :: report
      !! statistics to print (only read in this routine)
   INTEGER, INTENT(IN) :: output_unit
      !! unit to write to; the routine returns immediately when it is <= 0

   ! Row layouts shared by the summary lines: label, integer or real total,
   ! then three percentages.
   CHARACTER(LEN=*), PARAMETER :: fmt_count_row = "(A,T30,I20,5X,F5.1,'%',4X,F5.1,'%',4X,F5.1,'%')"
   CHARACTER(LEN=*), PARAMETER :: fmt_float_row = "(A,T30,EN20.6,5X,F5.1,'%',4X,F5.1,'%',4X,F5.1,'%')"

   CHARACTER(LEN=4) :: untuned_mark
   INTEGER :: icol, ipos, irow
   INTEGER, ALLOCATABLE, DIMENSION(:) :: order
   INTEGER(KIND=int_8) :: homo_flops_sum, inhomo_flops, mnk_volume, row_flops, &
                          row_mults, total
   INTEGER(KIND=int_8), ALLOCATABLE, DIMENSION(:) :: flops_key
   INTEGER(KIND=int_8), DIMENSION(3) :: homo_flops, mults_by_backend
   LOGICAL :: any_cpu_kernel, any_untuned_acc
   REAL :: denom
   REAL, DIMENSION(3) :: cols

   IF (output_unit <= 0) RETURN

   WRITE (output_unit, "(1X,A,T45,A,T57,A,T68,A,T78,A)") "COUNTER", "TOTAL", "BLAS", "SMM", "ACC"

   ! Order the per-(m,n,k) rows (all rows but the first) by their total flop
   ! count: 2*m*n*k times the multiplication counts summed over columns 4:6.
   ! NOTE(review): `sort` is assumed to return in `order` the permutation that
   ! sorts the key -- confirm against its definition.
   ALLOCATE (flops_key(SIZE(report%num_mnk_stacks, 1) - 1))
   flops_key(:) = 2*PRODUCT(report%num_mnk_stacks(2:, 1:3), DIM=2)*SUM(report%num_mnk_stacks(2:, 4:6), DIM=2)
   ALLOCATE (order(SIZE(flops_key)))
   CALL sort(flops_key, SIZE(flops_key), order)

   homo_flops_sum = 0
   homo_flops(:) = 0
   any_untuned_acc = .FALSE.
   any_cpu_kernel = .FALSE.

   DO ipos = 1, SIZE(order)
      ! The key was built from rows 2.., so shift the sorted index back by one.
      irow = order(ipos) + 1

      mnk_volume = PRODUCT(report%num_mnk_stacks(irow, 1:3))
      row_mults = SUM(report%num_mnk_stacks(irow, 4:6))
      row_flops = 2*row_mults*mnk_volume

      ! Accumulate what the per-(m,n,k) rows account for, in total and per column.
      homo_flops_sum = homo_flops_sum + row_flops
      homo_flops(:) = homo_flops(:) + 2*report%num_mnk_stacks(irow, 4:6)*mnk_volume

      ! A non-zero column 10 marks the row with "(*)" (untuned ACC kernel,
      ! according to the warning text below).
      untuned_mark = ""
      IF (report%num_mnk_stacks(irow, 10) /= 0) THEN
         untuned_mark = "(*)"
         any_untuned_acc = .TRUE.
      END IF

      ! Columns 4:5 are printed under BLAS and SMM; any count there means work
      ! was done on the CPU side.
      IF (SUM(report%num_mnk_stacks(irow, 4:5)) > 0) any_cpu_kernel = .TRUE.

      ! MAX(1, ...) keeps the percentage well defined for an all-zero row.
      denom = REAL(MAX(INT(1, KIND=int_8), row_mults))
      cols(:) = 100*REAL(report%num_mnk_stacks(irow, 4:6))/denom
      WRITE (output_unit, "(A,I5,' x ',I5,' x ',I5,T30,I20,5X,F5.1,'%',4X,F5.1,'%',4X,F5.1,'% ',A)") &
         " flops ", report%num_mnk_stacks(irow, 1:3), row_flops, cols, untuned_mark
   END DO

   IF (any_untuned_acc) THEN
      CALL dbcsr_warn(__LOCATION__, &
                      " (*) ACC Untuned kernels, consider to run the ACC tuning procedure for them")
   END IF
   IF (any_cpu_kernel .AND. use_acc()) THEN
      CALL dbcsr_warn(__LOCATION__, &
                      " Some kernels are running on the CPU, consider to run the ACC tuning procedure for them")
   END IF

   ! Flops not covered by the per-(m,n,k) rows: overall counters minus the
   ! sums accumulated in the loop above.
   total = report%cpu_flop + report%smm_flop + report%acc_flop
   inhomo_flops = total - homo_flops_sum
   denom = REAL(MAX(INT(1, KIND=int_8), inhomo_flops))
   cols(:) = 100*[REAL(report%cpu_flop - homo_flops(1)), &
                  REAL(report%smm_flop - homo_flops(2)), &
                  REAL(report%acc_flop - homo_flops(3))]/denom
   WRITE (output_unit, fmt_count_row) " flops inhomo. stacks", inhomo_flops, cols

   ! Overall flops and their split over the three columns.
   denom = REAL(MAX(INT(1, KIND=int_8), total))
   cols(:) = 100*[REAL(report%cpu_flop), REAL(report%smm_flop), REAL(report%acc_flop)]/denom
   WRITE (output_unit, fmt_float_row) " flops total", REAL(total, KIND=real_8), cols

   ! Same split for the max_* counters.
   total = report%max_cpu_flop + report%max_smm_flop + report%max_acc_flop
   denom = REAL(MAX(INT(1, KIND=int_8), total))
   cols(:) = 100*[REAL(report%max_cpu_flop), REAL(report%max_smm_flop), REAL(report%max_acc_flop)]/denom
   WRITE (output_unit, fmt_float_row) " flops max/rank", REAL(total, KIND=real_8), cols

   ! Row 1 of num_mnk_stacks holds the multiplication counts reported as
   ! "inhomo. stacks".
   total = SUM(report%num_mnk_stacks(1, 4:6))
   denom = REAL(MAX(INT(1, KIND=int_8), total))
   cols(:) = 100*REAL(report%num_mnk_stacks(1, 4:6))/denom
   WRITE (output_unit, fmt_count_row) " matmuls inhomo. stacks", total, cols

   ! Multiplication counts over all rows, per column and in total.
   mults_by_backend(:) = SUM(report%num_mnk_stacks(:, 4:6), DIM=1)
   total = SUM(mults_by_backend)
   denom = REAL(MAX(INT(1, KIND=int_8), total))
   cols(:) = 100*REAL(mults_by_backend)/denom
   WRITE (output_unit, fmt_count_row) " matmuls total", total, cols

   total = report%cpu_num_stacks + report%smm_num_stacks + report%acc_num_stacks
   denom = REAL(MAX(INT(1, KIND=int_8), total))
   cols(:) = 100*[REAL(report%cpu_num_stacks), REAL(report%smm_num_stacks), REAL(report%acc_num_stacks)]/denom
   WRITE (output_unit, fmt_count_row) " number of processed stacks", total, cols

   ! Average stack size per column: column 3+c summed over all rows divided by
   ! column 6+c summed over all rows.
   ! NOTE(review): columns 7:9 are presumably the per-backend stack counts -- confirm.
   DO icol = 1, 3
      cols(icol) = REAL(SUM(report%num_mnk_stacks(:, 3 + icol)))/ &
                   REAL(MAX(INT(1, KIND=int_8), SUM(report%num_mnk_stacks(:, 6 + icol))))
   END DO
   WRITE (output_unit, '(A,T51,F9.1,1X,F9.1,1X,F9.1)') " average stack size", cols
END SUBROUTINE stats_print_report