stats_print_report Subroutine

private subroutine stats_print_report(report, output_unit)

Prints collected statistics

Arguments

Type Intent Optional Attributes Name
type(stats_type), intent(inout) :: report
integer, intent(in) :: output_unit

Source Code

   SUBROUTINE stats_print_report(report, output_unit)
      !! Prints collected statistics
      !!
      !! Writes a table with the column headings COUNTER / TOTAL / BLAS / SMM / ACC
      !! to output_unit. One " flops" line is written per row 2.. of
      !! report%num_mnk_stacks, visited through the index array returned by sort,
      !! followed by summary lines for flops, matmuls, processed stacks and the
      !! average stack size. Two warnings may be raised through dbcsr_warn: one
      !! if any printed row has a non-zero column 10, one if any printed row has
      !! a positive count in columns 4:5 while use_acc() is true.
      TYPE(stats_type), INTENT(INOUT)                    :: report
         !! statistics to print (only read by this routine)
      INTEGER, INTENT(IN)                                :: output_unit
         !! unit to write to; nothing is done if it is <= 0

      CHARACTER(LEN=*), PARAMETER :: fmt_eng = "(A,T30,EN20.6,5X,F5.1,'%',4X,F5.1,'%',4X,F5.1,'%')"
      CHARACTER(LEN=*), PARAMETER :: fmt_int = "(A,T30,I20,5X,F5.1,'%',4X,F5.1,'%',4X,F5.1,'%')"
      INTEGER(KIND=int_8), PARAMETER                     :: one = INT(1, KIND=int_8)

      CHARACTER(LEN=4)                                   :: untuned_mark
      INTEGER                                            :: pos, row
      INTEGER(KIND=int_8)                                :: entry_flops, flops_all, flops_inhomo, &
                                                            homo_flops_sum, mnk_volume, n_total
      INTEGER(KIND=int_8), ALLOCATABLE, DIMENSION(:)     :: flop_key
      INTEGER(KIND=int_8), DIMENSION(3)                  :: homo_flops, mults_per_driver
      INTEGER, ALLOCATABLE, DIMENSION(:)                 :: order
      LOGICAL                                            :: any_on_cpu, any_untuned
      REAL                                               :: denom

      IF (output_unit <= 0) RETURN

      WRITE (output_unit, "(1X,A,T45,A,T57,A,T68,A,T78,A)") "COUNTER", "TOTAL", "BLAS", "SMM", "ACC"

      ! Sort key for each row 2..: 2 * product(columns 1:3) * sum(columns 4:6),
      ! i.e. the same value that is printed as the row's flops below.
      ALLOCATE (flop_key(SIZE(report%num_mnk_stacks, 1) - 1))
      flop_key(:) = 2*PRODUCT(report%num_mnk_stacks(2:, 1:3), DIM=2)*SUM(report%num_mnk_stacks(2:, 4:6), DIM=2)
      ALLOCATE (order(SIZE(flop_key)))
      CALL sort(flop_key, SIZE(flop_key), order)

      any_on_cpu = .FALSE.
      any_untuned = .FALSE.
      homo_flops(:) = 0
      homo_flops_sum = 0

      DO pos = 1, SIZE(order)
         ! the keys were built from rows 2.., hence the shift by one
         row = order(pos) + 1
         mnk_volume = PRODUCT(report%num_mnk_stacks(row, 1:3))
         n_total = SUM(report%num_mnk_stacks(row, 4:6))
         entry_flops = 2*n_total*mnk_volume

         ! accumulate what the listed rows contribute, overall and per column 4:6
         homo_flops_sum = homo_flops_sum + entry_flops
         homo_flops(:) = homo_flops(:) + 2*report%num_mnk_stacks(row, 4:6)*mnk_volume

         ! a non-zero column 10 marks the row with "(*)" and triggers the warning below
         untuned_mark = ""
         IF (report%num_mnk_stacks(row, 10) .NE. 0) THEN
            untuned_mark = "(*)"
            any_untuned = .TRUE.
         END IF

         ! columns 4:5 are the counts printed under the BLAS and SMM headings
         any_on_cpu = any_on_cpu .OR. (SUM(report%num_mnk_stacks(row, 4:5)) > 0)

         ! guard the percentage denominator against a zero count
         denom = REAL(MAX(one, n_total))
         WRITE (output_unit, "(A,I5,' x ',I5,' x ',I5,T30,I20,5X,F5.1,'%',4X,F5.1,'%',4X,F5.1,'% ',A)") &
            " flops ", report%num_mnk_stacks(row, 1:3), &
            entry_flops, &
            100*REAL(report%num_mnk_stacks(row, 4:6))/denom, &
            untuned_mark
      END DO

      IF (any_untuned) THEN
         CALL dbcsr_warn(__LOCATION__, &
                         " (*) ACC Untuned kernels, consider to run the ACC tuning procedure for them")
      END IF

      IF (any_on_cpu .AND. use_acc()) THEN
         CALL dbcsr_warn(__LOCATION__, &
                         " Some kernels are running on the CPU, consider to run the ACC tuning procedure for them")
      END IF

      ! flops not accounted for by the rows listed above
      flops_all = report%cpu_flop + report%smm_flop + report%acc_flop
      flops_inhomo = flops_all - homo_flops_sum
      denom = REAL(MAX(one, flops_inhomo))
      WRITE (output_unit, fmt_int) &
         " flops inhomo. stacks", flops_inhomo, &
         100*REAL(report%cpu_flop - homo_flops(1))/denom, &
         100*REAL(report%smm_flop - homo_flops(2))/denom, &
         100*REAL(report%acc_flop - homo_flops(3))/denom

      denom = REAL(MAX(one, flops_all))
      WRITE (output_unit, fmt_eng) &
         " flops total", REAL(flops_all, KIND=real_8), &
         100*REAL(report%cpu_flop)/denom, &
         100*REAL(report%smm_flop)/denom, &
         100*REAL(report%acc_flop)/denom

      n_total = report%max_cpu_flop + report%max_smm_flop + report%max_acc_flop
      denom = REAL(MAX(one, n_total))
      WRITE (output_unit, fmt_eng) &
         " flops max/rank", REAL(n_total, KIND=real_8), &
         100*REAL(report%max_cpu_flop)/denom, &
         100*REAL(report%max_smm_flop)/denom, &
         100*REAL(report%max_acc_flop)/denom

      ! row 1 holds the counts reported as "inhomo. stacks"
      n_total = SUM(report%num_mnk_stacks(1, 4:6))
      denom = REAL(MAX(one, n_total))
      WRITE (output_unit, fmt_int) &
         " matmuls inhomo. stacks", n_total, &
         100*REAL(report%num_mnk_stacks(1, 4:6))/denom

      ! column sums over all rows; reused for the average stack size below
      mults_per_driver(:) = SUM(report%num_mnk_stacks(:, 4:6), DIM=1)
      n_total = SUM(mults_per_driver)
      denom = REAL(MAX(one, n_total))
      WRITE (output_unit, fmt_int) &
         " matmuls total", n_total, &
         100*REAL(mults_per_driver)/denom

      n_total = report%cpu_num_stacks + report%smm_num_stacks + report%acc_num_stacks
      denom = REAL(MAX(one, n_total))
      WRITE (output_unit, fmt_int) &
         " number of processed stacks", n_total, &
         100*REAL(report%cpu_num_stacks)/denom, &
         100*REAL(report%smm_num_stacks)/denom, &
         100*REAL(report%acc_num_stacks)/denom

      ! column sums 4:6 divided element-wise by column sums 7:9 (each at least 1)
      WRITE (output_unit, '(A,T51,F9.1,1X,F9.1,1X,F9.1)') " average stack size", &
         REAL(mults_per_driver)/REAL(MAX(one, SUM(report%num_mnk_stacks(:, 7:9), DIM=1)))

   END SUBROUTINE stats_print_report