dbcsr_mm_cannon.F Source File


Source Code

# 1 "/__w/dbcsr/dbcsr/src/mm/dbcsr_mm_cannon.F" 1
!--------------------------------------------------------------------------------------------------!
! Copyright (C) by the DBCSR developers group - All rights reserved                                !
! This file is part of the DBCSR library.                                                          !
!                                                                                                  !
! For information on the license, see the LICENSE file.                                            !
! For further information please visit https://dbcsr.cp2k.org                                      !
! SPDX-License-Identifier: GPL-2.0+                                                                !
!--------------------------------------------------------------------------------------------------!

MODULE dbcsr_mm_cannon
   !! First layer of the dbcsr matrix-matrix multiplication.
   !! It performs the MPI parallelization according to Cannon's algorithm.
   !! <b>Modification history:</b>
   !! - 2010-02-23 Moved from dbcsr_operations
   !! - 2011-11    Moved parameter-stack processing routines to
   !! dbcsr_mm_methods.
   !! - 2013-01    reorganized code (Ole Schuett)

   USE dbcsr_acc_event, ONLY: acc_event_synchronize
   USE dbcsr_acc_device, ONLY: acc_device_synchronize
   USE dbcsr_array_types, ONLY: array_data, &
                                array_exists, &
                                array_i1d_obj, &
                                array_size
   USE dbcsr_block_operations, ONLY: dbcsr_block_conjg, &
                                     dbcsr_block_copy_aa, &
                                     dbcsr_block_real_neg, &
                                     dbcsr_block_scale, &
                                     dbcsr_block_transpose_aa, &
                                     dbcsr_block_transpose
   USE dbcsr_config, ONLY: dbcsr_cfg, &
                           has_acc
   USE dbcsr_data_methods, ONLY: &
      dbcsr_data_clear_pointer, dbcsr_data_ensure_size, dbcsr_data_get_size, &
      dbcsr_data_get_size_referenced, dbcsr_data_hold, dbcsr_data_host2dev, dbcsr_data_init, &
      dbcsr_data_new, dbcsr_data_release, dbcsr_data_set_pointer, &
      dbcsr_data_set_size_referenced, dbcsr_scalar_are_equal, dbcsr_scalar_negative, &
      dbcsr_scalar_one, dbcsr_get_data_p_s, dbcsr_get_data_p_d, dbcsr_get_data_p_c, dbcsr_get_data_p_z
   USE dbcsr_data_types, ONLY: dbcsr_datatype_sizeof
   USE dbcsr_dist_methods, ONLY: dbcsr_distribution_col_dist, &
                                 dbcsr_distribution_has_threads, &
                                 dbcsr_distribution_max_col_dist, &
                                 dbcsr_distribution_max_row_dist, &
                                 dbcsr_distribution_mp, &
                                 dbcsr_distribution_row_dist
   USE dbcsr_index_operations, ONLY: dbcsr_count_row_index, &
                                     dbcsr_has_local_row_index, &
                                     dbcsr_index_prune_deleted, &
                                     dbcsr_make_index_list, &
                                     dbcsr_make_index_local_row, &
                                     dbcsr_repoint_index
   USE dbcsr_io, ONLY: dbcsr_print
   USE dbcsr_iterator_operations, ONLY: dbcsr_iterator_blocks_left, &
                                        dbcsr_iterator_next_block, &
                                        dbcsr_iterator_start, &
                                        dbcsr_iterator_stop
   USE dbcsr_mem_methods, ONLY: dbcsr_mempool_limit_capacity
   USE dbcsr_methods, ONLY: &
      dbcsr_destroy_array, dbcsr_distribution, dbcsr_get_data_type, dbcsr_get_index_memory_type, &
      dbcsr_has_symmetry, dbcsr_image_dist_hold, dbcsr_image_dist_init, &
      dbcsr_image_dist_release, dbcsr_max_col_size, dbcsr_max_row_size, dbcsr_nblkcols_local, &
      dbcsr_nblkrows_local, dbcsr_nblkrows_total, dbcsr_release, dbcsr_valid_index
   USE dbcsr_mm_common, ONLY: &
      acc_transpose_blocks, calculate_norms, count_mpi_statistics, dbcsr_mm_multrec_type_p, &
      dbcsr_mpi_statistics, enumerate_blk_sizes, huge_norm, local_filter, max_memory, &
      memtype_abpanel_1, memtype_abpanel_2, memtype_mpi_buffer, memtype_trsbuffer_1, &
      memtype_trsbuffer_2, product_matrix_size_guess, rec_sort_index, setup_buffer_matrix
   USE dbcsr_mm_dist_operations, ONLY: dbcsr_get_local_vcols, &
                                       dbcsr_get_local_vrows, &
                                       dbcsr_reset_vlocals, &
                                       image_calculator
   USE dbcsr_mm_multrec, ONLY: dbcsr_mm_multrec_finalize, &
                               dbcsr_mm_multrec_init, &
                               dbcsr_mm_multrec_multiply
   USE dbcsr_mp_methods, ONLY: &
      dbcsr_mp_grid_setup, dbcsr_mp_group, dbcsr_mp_has_subgroups, dbcsr_mp_my_col_group, &
      dbcsr_mp_my_row_group, dbcsr_mp_mynode, dbcsr_mp_mypcol, dbcsr_mp_myprow, dbcsr_mp_npcols, &
      dbcsr_mp_nprows, dbcsr_mp_numnodes, dbcsr_mp_pgrid
   USE dbcsr_mp_operations, ONLY: dbcsr_irecv_any, &
                                  dbcsr_isend_any, &
                                  hybrid_alltoall_any, &
                                  hybrid_alltoall_i1
   USE dbcsr_operations, ONLY: dbcsr_copy, &
                               dbcsr_crop_matrix
   USE dbcsr_ptr_util, ONLY: ensure_array_size
   USE dbcsr_transformations, ONLY: dbcsr_make_dense_low
   USE dbcsr_types, ONLY: &
      dbcsr_2d_array_type, dbcsr_data_obj, dbcsr_distribution_obj, dbcsr_imagedistribution_obj, &
      dbcsr_iterator, dbcsr_memtype_type, dbcsr_meta_size, dbcsr_mp_obj, dbcsr_scalar_type, &
      dbcsr_slot_home_coli, dbcsr_slot_home_pcol, dbcsr_slot_home_prow, dbcsr_slot_home_rowi, &
      dbcsr_slot_home_vpcol, dbcsr_slot_home_vprow, dbcsr_slot_size, dbcsr_type, &
      dbcsr_type_complex_4, dbcsr_type_complex_8, dbcsr_type_int_4, dbcsr_type_no_symmetry, &
      dbcsr_type_real_4, dbcsr_type_real_8
   USE dbcsr_dist_util, ONLY: count_bins, &
                              dbcsr_checksum
   USE dbcsr_work_operations, ONLY: dbcsr_create, &
                                    dbcsr_special_finalize, &
                                    dbcsr_work_create
   USE dbcsr_kinds, ONLY: dp, &
                          int_8, &
                          real_4, &
                          real_8, &
                          sp
   USE dbcsr_machine, ONLY: default_output_unit, &
                            m_memory
   USE dbcsr_mpiwrap, ONLY: mp_allgather, &
                            mp_alltoall, &
                            mp_irecv, &
                            mp_isend, &
                            mp_request_null, &
                            mp_sum, &
                            mp_testany, &
                            mp_waitall, mp_comm_type, mp_request_type
#include "base/dbcsr_base_uses.f90"

!$ USE OMP_LIB, ONLY: omp_get_max_threads, omp_get_thread_num, omp_get_num_threads

   IMPLICIT NONE

   PRIVATE

   CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'dbcsr_mm_cannon'
   LOGICAL, PARAMETER :: debug_mod = .FALSE.
   LOGICAL, PARAMETER :: careful_mod = .FALSE.

   INTERFACE dbcsr_switch
      MODULE PROCEDURE dbcsr_switch_sets
      MODULE PROCEDURE dbcsr_switch_d_ptrs
   END INTERFACE

   PUBLIC :: multiply_cannon, make_m2s

CONTAINS

   SUBROUTINE make_m2s(matrix, m2s, rdist, dense_rdist, &
      !! Make images from the matrix (left or right)
                       use_dense_mult, ab_dense, predistribute, &
                       f_k, l_k, f_row, l_row, f_col, l_col, &
                       row_blk_size, col_blk_size, &
                       k_vmap, m_map, n_map, &
                       alpha)
      TYPE(dbcsr_type), INTENT(IN)                       :: matrix
      TYPE(dbcsr_2d_array_type), INTENT(OUT), POINTER    :: m2s
      TYPE(dbcsr_imagedistribution_obj), INTENT(INOUT)   :: rdist, dense_rdist
      LOGICAL, INTENT(IN)                                :: use_dense_mult, ab_dense
      CHARACTER, INTENT(IN)                              :: predistribute
      INTEGER, INTENT(IN)                                :: f_k, l_k, f_row, l_row, f_col, l_col
      TYPE(array_i1d_obj), INTENT(INOUT)                 :: row_blk_size, col_blk_size
      TYPE(array_i1d_obj), INTENT(IN)                    :: k_vmap, m_map, n_map
      TYPE(dbcsr_scalar_type), INTENT(IN), OPTIONAL      :: alpha

      CHARACTER(len=*), PARAMETER :: routineN = 'make_m2s'
      INTEGER                                            :: handle, i, im, j, jm
      INTEGER, DIMENSION(4)                              :: f_crop
      LOGICAL                                            :: do_scale, thread_redist
      TYPE(array_i1d_obj)                                :: col_map, row_map
      TYPE(dbcsr_type)                                   :: dense_template, matrix_tmp

      CALL timeset(routineN, handle)
      ALLOCATE (m2s)
      do_scale = .FALSE.
      IF (PRESENT(alpha)) THEN
         IF (.NOT. dbcsr_scalar_are_equal(alpha, dbcsr_scalar_one(alpha%data_type))) THEN
            do_scale = .TRUE.
         END IF
      END IF

      IF (do_scale) THEN
         ! Copy and scale matrix if alpha is not 1.
         CALL dbcsr_make_images(matrix, m2s, rdist, &
                                predistribute=predistribute, &
                                no_copy_data=use_dense_mult, scale_value=alpha)
      ELSE
         CALL dbcsr_make_images(matrix, m2s, rdist, &
                                predistribute=predistribute, &
                                no_copy_data=use_dense_mult)
      END IF

      im = SIZE(m2s%mats, 1)
      jm = SIZE(m2s%mats, 2)
      SELECT CASE (predistribute)
      CASE ('L')
         f_crop = (/f_row, l_row, f_k, l_k/)
         row_map = m_map
         col_map = k_vmap
         thread_redist = .TRUE.
      CASE default
         f_crop = (/f_k, l_k, f_col, l_col/)
         row_map = k_vmap
         col_map = n_map
         thread_redist = .FALSE.
      END SELECT

      ! Post-processing of images.
      DO i = 1, im
         DO j = 1, jm
            CALL dbcsr_reset_vlocals(m2s%mats(i, j), rdist)
            ! Crop if necessary
            IF (ANY(f_crop .NE. 0)) THEN
               matrix_tmp = dbcsr_type()
               CALL dbcsr_crop_matrix(matrix_tmp, m2s%mats(i, j), &
                                      full_row_bounds=f_crop(1:2), &
                                      full_column_bounds=f_crop(3:4), &
                                      shallow_data=.FALSE.)
               CALL dbcsr_release(m2s%mats(i, j))
               CALL dbcsr_copy(m2s%mats(i, j), matrix_tmp, shallow_data=.TRUE.)
               CALL dbcsr_release(matrix_tmp)
               CALL dbcsr_reset_vlocals(m2s%mats(i, j), rdist)
            END IF
         END DO
      END DO

      IF (ab_dense) THEN
         dense_template = dbcsr_type()
         CALL dbcsr_create(dense_template, template=matrix, &
                           dist=dense_rdist%i%main, &
                           row_blk_size_obj=row_blk_size, col_blk_size_obj=col_blk_size)
         CALL dbcsr_make_images_dense(m2s, dense_rdist, &
                                      row_map=row_map, col_map=col_map, &
                                      join_cols=use_dense_mult, join_rows=ab_dense, &
                                      new_template=dense_template)
         CALL dbcsr_image_dist_release(rdist)
         rdist = dense_rdist
         CALL dbcsr_image_dist_hold(rdist)
         DO i = 1, im
            DO j = 1, jm
               CALL dbcsr_reset_vlocals(m2s%mats(i, j), rdist)
            END DO
         END DO
      END IF

      DO i = 1, im
         DO j = 1, jm
            ! Convert to local-row index
            CALL dbcsr_make_index_local_row(m2s%mats(i, j))
            ! Convert to list index
            CALL dbcsr_make_index_list(m2s%mats(i, j), thread_redist=thread_redist)
         END DO
      END DO

      IF (ab_dense) THEN
         CALL dbcsr_image_dist_release(dense_rdist)
         CALL dbcsr_release(dense_template)
      END IF

      CALL timestop(handle)
   END SUBROUTINE make_m2s

   SUBROUTINE dbcsr_make_images(source, normalized, target_image_dist, &
                                predistribute, no_copy_data, scale_value)
      !! Creates row and column images of a matrix.

      TYPE(dbcsr_type), INTENT(IN)                       :: source
         !! input matrix
      TYPE(dbcsr_2d_array_type), INTENT(OUT)             :: normalized
         !! image array of the normalized matrix
      TYPE(dbcsr_imagedistribution_obj), INTENT(IN)      :: target_image_dist
         !! normalize to this image distribution
      CHARACTER, INTENT(IN), OPTIONAL                    :: predistribute
         !! predistribute data for multiplication
      LOGICAL, INTENT(IN), OPTIONAL                      :: no_copy_data
         !! try to not merge data at the end
      TYPE(dbcsr_scalar_type), INTENT(IN), OPTIONAL      :: scale_value
         !! scale with this value

!   ---------------------------------------------------------------------------

      IF (dbcsr_cfg%use_mpi_rma%val) &
         DBCSR_ABORT("RMA algo not supported here!")
      IF (.NOT. dbcsr_valid_index(source)) &
         DBCSR_ABORT("Matrix not initialized.")
      CALL make_images(source, normalized, &
                       target_image_dist, desymmetrize=dbcsr_has_symmetry(source), &
                       predistribute=predistribute, &
                       no_copy_data=no_copy_data, &
                       scale_value=scale_value)
      normalized%image_dist = target_image_dist
      CALL dbcsr_image_dist_hold(normalized%image_dist)
   END SUBROUTINE dbcsr_make_images

   SUBROUTINE make_images(ism, ums, target_imgdist, desymmetrize, predistribute, &
                          no_copy_data, scale_value)
      !! Makes column-based and row-based images of a matrix.

      TYPE(dbcsr_type), INTENT(IN)                       :: ism
         !! input symmetric matrix
      TYPE(dbcsr_2d_array_type), INTENT(OUT)             :: ums
         !! normalized matrices
      TYPE(dbcsr_imagedistribution_obj), INTENT(IN)      :: target_imgdist
         !! image distribution to normalize to
      LOGICAL, INTENT(IN), OPTIONAL                      :: desymmetrize
         !! desymmetrize a symmetric matrix
      CHARACTER, INTENT(IN), OPTIONAL                    :: predistribute
         !! predistribute data for multiplication
      LOGICAL, INTENT(IN), OPTIONAL                      :: no_copy_data
         !! try to not merge data at the end
      TYPE(dbcsr_scalar_type), INTENT(IN), OPTIONAL      :: scale_value
         !! scale with this value

      CHARACTER(len=*), PARAMETER :: routineN = 'make_images'
      INTEGER, PARAMETER                                 :: metalen = 5
      LOGICAL, PARAMETER                                 :: dbg = .FALSE.

      CHARACTER                                          :: predist_type, predist_type_fwd
      INTEGER :: blk, blk_l, blk_p, bp, col, col_img, col_size, coli, data_type, dst_p, handle, &
                 handle2, ithread, ncol_images, nrow_images, nsymmetries, nthreads, numproc, &
                 nze, pcol, prow, row, row_img, row_size, rowi, sd_pos, sm_pos, src_p, stored_blk_p, &
                 stored_col, stored_row, symmetry_i, tr_col_size, tr_row_size, vcol, vrow
      INTEGER, ALLOCATABLE, DIMENSION(:)                 :: myt_sdp, myt_smp, rd_disp, recv_meta, &
                                                            rm_disp, sd_disp, sdp, send_meta, &
                                                            sm_disp, smp
      INTEGER, ALLOCATABLE, DIMENSION(:, :)              :: all_total_send_offset, blk_ps, blks, &
                                                            myt_total_send_count, &
                                                            total_recv_count, total_send_count
      INTEGER, ALLOCATABLE, DIMENSION(:, :, :, :)        :: myt_send_count, recv_count, send_count
      INTEGER, DIMENSION(:), CONTIGUOUS, POINTER         :: col_blk_size, col_dist, col_img_dist, &
                                                            row_blk_size, row_dist, row_img_dist
      INTEGER, DIMENSION(:, :), CONTIGUOUS, POINTER      :: blacs2mpi
      LOGICAL                                            :: nocopy, tr
      TYPE(dbcsr_data_obj)                               :: received_data_area, recv_data_area, &
                                                            send_data_area
      TYPE(dbcsr_distribution_obj)                       :: old_dist, target_dist
      TYPE(dbcsr_iterator)                               :: iter
      TYPE(dbcsr_memtype_type)                           :: data_memory_type
      TYPE(dbcsr_mp_obj)                                 :: mp_obj
      TYPE(dbcsr_scalar_type)                            :: scale_neg_one
      TYPE(dbcsr_type)                                   :: sm
      TYPE(mp_comm_type)                                 :: mp_group

!   ---------------------------------------------------------------------------
! Check input matrix
! Set convenient variables to access input matrix info
!

      CALL timeset(routineN, handle)
      nocopy = .FALSE.
      IF (PRESENT(no_copy_data)) nocopy = no_copy_data
      sm = ism
      nsymmetries = 1
      IF (PRESENT(desymmetrize)) THEN
         IF (desymmetrize .AND. sm%symmetry) THEN
            nsymmetries = 2
         END IF
      END IF
      SELECT CASE (predistribute)
      CASE ('L', 'l')
         predist_type = 'L'
         predist_type_fwd = 'l'
      CASE ('R', 'r')
         predist_type = 'R'
         predist_type_fwd = 'r'
      CASE default
         DBCSR_ABORT("Incorrect pre-shift specifier.")
      END SELECT
      data_type = sm%data_type
      IF (data_type .NE. dbcsr_type_real_8 .AND. &
          data_type .NE. dbcsr_type_real_4 .AND. &
          data_type .NE. dbcsr_type_complex_8 .AND. &
          data_type .NE. dbcsr_type_complex_4) &
         DBCSR_ABORT("Invalid data type.")
      scale_neg_one = dbcsr_scalar_negative(dbcsr_scalar_one(data_type))
      row_blk_size => array_data(sm%row_blk_size)
      col_blk_size => array_data(sm%col_blk_size)
      old_dist = dbcsr_distribution(ism)
      target_dist = target_imgdist%i%main
      row_dist => dbcsr_distribution_row_dist(target_dist)
      col_dist => dbcsr_distribution_col_dist(target_dist)
      IF (sm%symmetry) THEN
         IF (SIZE(row_dist) .NE. SIZE(col_dist)) &
            DBCSR_WARN('Unequal row and column distributions for symmetric matrix.')
      END IF
      nrow_images = target_imgdist%i%row_decimation
      IF (nrow_images .GT. 1) THEN
         row_img_dist => array_data(target_imgdist%i%row_image)
      ELSE
         NULLIFY (row_img_dist)
      END IF
      ncol_images = target_imgdist%i%col_decimation
      IF (ncol_images .GT. 1) THEN
         col_img_dist => array_data(target_imgdist%i%col_image)
      ELSE
         NULLIFY (col_img_dist)
      END IF
      mp_obj = dbcsr_distribution_mp(target_dist)
      blacs2mpi => dbcsr_mp_pgrid(mp_obj)
      numproc = dbcsr_mp_numnodes(mp_obj)
      mp_group = dbcsr_mp_group(mp_obj)
      IF (dbcsr_distribution_max_row_dist(old_dist) .GT. UBOUND(blacs2mpi, 1)) &
         DBCSR_ABORT('Row distribution references unexistent processor rows')
      IF (dbg) THEN
         IF (dbcsr_distribution_max_row_dist(old_dist) .NE. UBOUND(blacs2mpi, 1)) &
            DBCSR_WARN('Range of row distribution not equal to processor rows')
      END IF
      IF (dbcsr_distribution_max_col_dist(old_dist) .GT. UBOUND(blacs2mpi, 2)) &
         DBCSR_ABORT('Col distribution references unexistent processor cols')
      IF (dbg) THEN
         IF (dbcsr_distribution_max_col_dist(old_dist) .NE. UBOUND(blacs2mpi, 2)) &
            DBCSR_WARN('Range of col distribution not equal to processor cols')
      END IF

      ! Check threads configuration
!$    IF (.NOT. dbcsr_distribution_has_threads(old_dist)) &
!$       DBCSR_ABORT("Thread distribution not defined")

      ! Allocate shared temporary buffers
      !
      ALLOCATE (send_count(2, nrow_images, ncol_images, 0:numproc - 1)); send_count = 0
      ALLOCATE (recv_count(2, nrow_images, ncol_images, 0:numproc - 1))
      ALLOCATE (total_send_count(2, 0:numproc - 1)); total_send_count = 0
      ALLOCATE (total_recv_count(2, 0:numproc - 1))
      ALLOCATE (sdp(0:numproc - 1))
      ALLOCATE (smp(0:numproc - 1))
      ALLOCATE (sd_disp(0:numproc - 1)); sd_disp(0) = 1
      ALLOCATE (sm_disp(0:numproc - 1)); sm_disp(0) = 1
      ALLOCATE (rd_disp(0:numproc - 1)); rd_disp(0) = 1
      ALLOCATE (rm_disp(0:numproc - 1)); rm_disp(0) = 1
      ALLOCATE (all_total_send_offset(2, 0:numproc - 1))
      ALLOCATE (blk_ps(nrow_images, ncol_images)); blk_ps = 1
      ALLOCATE (blks(nrow_images, ncol_images)); blks = 1
      !
      ! Allocate and init mats
      !
      ALLOCATE (ums%mats(nrow_images, ncol_images))
      data_memory_type = memtype_abpanel_1
      DO row_img = 1, nrow_images
         DO col_img = 1, ncol_images
            ums%mats(row_img, col_img) = dbcsr_type()
            CALL dbcsr_create(ums%mats(row_img, col_img), "imaged "//sm%name, &
                              target_dist, &
                              dbcsr_type_no_symmetry, &
                              row_blk_size_obj=sm%row_blk_size, col_blk_size_obj=sm%col_blk_size, &
                              nze=0, data_type=data_type, &
                              max_rbs=sm%max_rbs, max_cbs=sm%max_cbs, &
                              row_blk_offset=sm%row_blk_offset, col_blk_offset=sm%col_blk_offset, &
                              thread_dist=sm%dist, &
                              data_memory_type=data_memory_type, &
                              index_memory_type=memtype_mpi_buffer)
            ums%mats(row_img, col_img)%negate_real = sm%negate_real
            ums%mats(row_img, col_img)%negate_imaginary = sm%negate_imaginary
         END DO
      END DO

      nthreads = 1
!$OMP PARALLEL DEFAULT (NONE) &
!$OMP PRIVATE (ithread, symmetry_i, row_img, col_img, &
!$OMP          myt_send_count, myt_total_send_count, &
!$OMP          iter, row, col, blk, row_size, col_size, stored_row, stored_col, &
!$OMP          prow, pcol, rowi, coli, vrow, vcol, dst_p, nze, myt_smp, myt_sdp, &
!$OMP          blk_p, bp, sd_pos, sm_pos,tr, &
!$OMP          tr_row_size, tr_col_size) &
!$OMP SHARED (nthreads, nsymmetries, row_img_dist, col_img_dist, &
!$OMP         nrow_images, ncol_images, numproc, scale_value, &
!$OMP         ums, sm, ism, target_imgdist, row_dist, col_dist,&
!$OMP         predist_type_fwd, blacs2mpi, row_blk_size, col_blk_size, &
!$OMP         send_count, recv_count, handle2,mp_group, &
!$OMP         total_send_count, total_recv_count, recv_data_area, nocopy, &
!$OMP         data_type, recv_meta, send_data_area, send_meta, &
!$OMP         sd_disp, sm_disp, rd_disp, rm_disp, all_total_send_offset, blk_ps, blks, &
!$OMP         received_data_area, scale_neg_one, memtype_abpanel_1)
      ithread = 0
!$    ithread = omp_get_thread_num()
!$OMP MASTER
!$    nthreads = omp_get_num_threads()
!$OMP END MASTER

      ! Allocate thread private data
      !
      ALLOCATE (myt_send_count(2, nrow_images, ncol_images, 0:numproc - 1)); myt_send_count(:, :, :, :) = 0
      ALLOCATE (myt_total_send_count(2, 0:numproc - 1))
      ! Thread-local pointers of the current adding position into the send buffers
      ALLOCATE (myt_smp(0:numproc - 1), myt_sdp(0:numproc - 1))

      ! Count sizes for sending
      !
      CALL dbcsr_iterator_start(iter, ism, shared=.TRUE.)
      DO WHILE (dbcsr_iterator_blocks_left(iter))
         CALL dbcsr_iterator_next_block(iter, row, col, blk, &
                                        row_size=row_size, col_size=col_size)
         nze = row_size*col_size
         IF (nze .EQ. 0) CYCLE
         DO symmetry_i = 1, nsymmetries
            IF (symmetry_i .EQ. 1) THEN
               stored_row = row; stored_col = col
            ELSE
               IF (row .EQ. col) CYCLE
               stored_row = col; stored_col = row
            END IF
            ! Where do we send this block?
            row_img = 1
            IF (nrow_images .GT. 1) row_img = row_img_dist(stored_row)
            col_img = 1
            IF (ncol_images .GT. 1) col_img = col_img_dist(stored_col)
            CALL image_calculator(target_imgdist, &
                                  prow=prow, rowi=rowi, &
                                  pcol=pcol, coli=coli, &
                                  vprow=vrow, vpcol=vcol, &
                                  myprow=row_dist(stored_row), myrowi=row_img, &
                                  mypcol=col_dist(stored_col), mycoli=col_img, &
                                  shifting=predist_type_fwd)
            dst_p = blacs2mpi(prow, pcol)
            ! These counts are meant for the thread that processes this row.
            myt_send_count(1, rowi, coli, dst_p) = &
               myt_send_count(1, rowi, coli, dst_p) + 1
            myt_send_count(2, rowi, coli, dst_p) = &
               myt_send_count(2, rowi, coli, dst_p) + nze
         END DO ! symmetry_i
      END DO
      CALL dbcsr_iterator_stop(iter)
      DO dst_p = 0, numproc - 1
         myt_total_send_count(1, dst_p) = SUM(myt_send_count(1, :, :, dst_p))
         myt_total_send_count(2, dst_p) = SUM(myt_send_count(2, :, :, dst_p))
      END DO
      ! Merge the send counts
!$OMP CRITICAL
      send_count(:, :, :, :) = send_count(:, :, :, :) + myt_send_count(:, :, :, :)
      total_send_count(:, :) = total_send_count(:, :) + myt_total_send_count(:, :)
!$OMP END CRITICAL
!$OMP BARRIER
!$OMP MASTER
      CALL timeset(routineN//"_sizes", handle2)
      CALL mp_alltoall(send_count, recv_count, 2*nrow_images*ncol_images, &
                       mp_group)
      CALL timestop(handle2)
!$OMP END MASTER
!$OMP BARRIER
      ! Fill in the meta data structures and copy the data.
!$OMP DO
      DO dst_p = 0, numproc - 1
         total_recv_count(1, dst_p) = SUM(recv_count(1, :, :, dst_p))
         total_recv_count(2, dst_p) = SUM(recv_count(2, :, :, dst_p))
      END DO
!$OMP MASTER
      ! Allocate data structures needed for data exchange.
      CALL dbcsr_data_init(recv_data_area)
      CALL dbcsr_data_init(send_data_area)
      IF (nrow_images .EQ. 1 .AND. ncol_images .EQ. 1 .OR. nocopy) THEN
         ! For some cases the faster dbcsr_special_finalize(reshuffle=.FALSE.) can be used.
         ! This basically makes this working matrix the actual data-area.
         ! Hence, for those cases we have to use data_memory_type already here.
         CALL dbcsr_data_new(recv_data_area, data_type, SUM(total_recv_count(2, :)), memory_type=memtype_abpanel_1)
         ! Some MPI implementations have high overhead when encountering a new buffer.
         ! Therefore we also use the memory pool for the send buffer.
         CALL dbcsr_data_new(send_data_area, data_type, SUM(total_send_count(2, :)), memory_type=memtype_abpanel_1)
      ELSE
         CALL dbcsr_data_new(recv_data_area, data_type, SUM(total_recv_count(2, :)))
         CALL dbcsr_data_new(send_data_area, data_type, SUM(total_send_count(2, :)))
      END IF
      ALLOCATE (recv_meta(metalen*SUM(total_recv_count(1, :))))
      ALLOCATE (send_meta(metalen*SUM(total_send_count(1, :))))
      ! Calculate displacements for processors needed for the exchanges.
      DO dst_p = 1, numproc - 1
         sm_disp(dst_p) = sm_disp(dst_p - 1) &
                          + metalen*total_send_count(1, dst_p - 1)
         sd_disp(dst_p) = sd_disp(dst_p - 1) &
                          + total_send_count(2, dst_p - 1)
         rm_disp(dst_p) = rm_disp(dst_p - 1) &
                          + metalen*total_recv_count(1, dst_p - 1)
         rd_disp(dst_p) = rd_disp(dst_p - 1) &
                          + total_recv_count(2, dst_p - 1)
      END DO
      myt_smp(:) = sm_disp(:)
      myt_sdp(:) = sd_disp(:)
      IF (nthreads .GT. 1) THEN
         all_total_send_offset(1, :) = myt_smp(:) + metalen*myt_total_send_count(1, :)
         all_total_send_offset(2, :) = myt_sdp(:) + myt_total_send_count(2, :)
      END IF
!$OMP END MASTER
!$OMP BARRIER
      IF (ithread .GT. 0) THEN
!$OMP CRITICAL
         myt_smp(:) = all_total_send_offset(1, :)
         myt_sdp(:) = all_total_send_offset(2, :)
         all_total_send_offset(1, :) &
            = all_total_send_offset(1, :) + metalen*myt_total_send_count(1, :)
         all_total_send_offset(2, :) &
            = all_total_send_offset(2, :) + myt_total_send_count(2, :)
!$OMP END CRITICAL
      ELSE
         CALL dbcsr_data_init(received_data_area)
         received_data_area = recv_data_area
         CALL dbcsr_data_hold(received_data_area)
         DO row_img = 1, nrow_images
            DO col_img = 1, ncol_images
               CALL dbcsr_work_create(ums%mats(row_img, col_img), &
                                      SUM(recv_count(1, row_img, col_img, :)), n=1)
               CALL dbcsr_data_hold(received_data_area)
               CALL dbcsr_data_release(ums%mats(row_img, col_img)%wms(1)%data_area)
               ums%mats(row_img, col_img)%wms(1)%data_area = received_data_area
            END DO
         END DO
      END IF
!$OMP BARRIER
      ! Add timing call to the packing of the send buffers
      !
      CALL timeset(routineN//"_pack", handle2)
      ! Copies metadata and actual data to be sent into the send buffers.
      CALL dbcsr_iterator_start(iter, ism, shared=.TRUE.)
      SELECT CASE (data_type)
      CASE (dbcsr_type_real_4)
         CALL prepare_buffers_s(sm%negate_real, sm%negate_imaginary, &
                                iter, row, col, blk, blk_p, bp, &
                                row_size, col_size, nze, nsymmetries, symmetry_i, &
                                stored_row, stored_col, tr_row_size, tr_col_size, tr, &
                                row_img, col_img, nrow_images, ncol_images, &
                                row_img_dist, col_img_dist, predist_type_fwd, blacs2mpi, &
                                target_imgdist, prow, pcol, rowi, coli, &
                                row_dist, col_dist, dst_p, sm_pos, myt_smp, metalen, &
                                sd_pos, myt_sdp, send_meta, sd_disp, &
                                dbcsr_get_data_p_s(sm%data_area), &
                                send_data_area, scale_neg_one, scale_value)
      CASE (dbcsr_type_real_8)
         CALL prepare_buffers_d(sm%negate_real, sm%negate_imaginary, &
                                iter, row, col, blk, blk_p, bp, &
                                row_size, col_size, nze, nsymmetries, symmetry_i, &
                                stored_row, stored_col, tr_row_size, tr_col_size, tr, &
                                row_img, col_img, nrow_images, ncol_images, &
                                row_img_dist, col_img_dist, predist_type_fwd, blacs2mpi, &
                                target_imgdist, prow, pcol, rowi, coli, &
                                row_dist, col_dist, dst_p, sm_pos, myt_smp, metalen, &
                                sd_pos, myt_sdp, send_meta, sd_disp, &
                                dbcsr_get_data_p_d(sm%data_area), &
                                send_data_area, scale_neg_one, scale_value)
      CASE (dbcsr_type_complex_4)
         CALL prepare_buffers_c(sm%negate_real, sm%negate_imaginary, &
                                iter, row, col, blk, blk_p, bp, &
                                row_size, col_size, nze, nsymmetries, symmetry_i, &
                                stored_row, stored_col, tr_row_size, tr_col_size, tr, &
                                row_img, col_img, nrow_images, ncol_images, &
                                row_img_dist, col_img_dist, predist_type_fwd, blacs2mpi, &
                                target_imgdist, prow, pcol, rowi, coli, &
                                row_dist, col_dist, dst_p, sm_pos, myt_smp, metalen, &
                                sd_pos, myt_sdp, send_meta, sd_disp, &
                                dbcsr_get_data_p_c(sm%data_area), &
                                send_data_area, scale_neg_one, scale_value)
      CASE (dbcsr_type_complex_8)
         CALL prepare_buffers_z(sm%negate_real, sm%negate_imaginary, &
                                iter, row, col, blk, blk_p, bp, &
                                row_size, col_size, nze, nsymmetries, symmetry_i, &
                                stored_row, stored_col, tr_row_size, tr_col_size, tr, &
                                row_img, col_img, nrow_images, ncol_images, &
                                row_img_dist, col_img_dist, predist_type_fwd, blacs2mpi, &
                                target_imgdist, prow, pcol, rowi, coli, &
                                row_dist, col_dist, dst_p, sm_pos, myt_smp, metalen, &
                                sd_pos, myt_sdp, send_meta, sd_disp, &
                                dbcsr_get_data_p_z(sm%data_area), &
                                send_data_area, scale_neg_one, scale_value)
      END SELECT
      CALL dbcsr_iterator_stop(iter)

      ! Deallocate thread private data
      !
      DEALLOCATE (myt_send_count)
      DEALLOCATE (myt_total_send_count)
      DEALLOCATE (myt_smp, myt_sdp)

      CALL timestop(handle2)
!$OMP END PARALLEL

      ! Exchange the data and metadata structures. In the interesting cases (square grids, row col distribution same),
      ! there are only very few processors that need to exchange data.
      ! The hybrid_alltoall deals with this by doing point to point communication
      CALL timeset(routineN//"_data", handle2)
      CALL hybrid_alltoall_any(send_data_area, total_send_count(2, :), sd_disp(:) - 1, &
                               recv_data_area, total_recv_count(2, :), rd_disp(:) - 1, &
                               mp_obj, &
                               most_ptp=.TRUE., remainder_ptp=.TRUE., no_hybrid=.FALSE.)
      CALL hybrid_alltoall_i1( &
         send_meta(:), metalen*total_send_count(1, :), sm_disp(:) - 1, &
         recv_meta(:), metalen*total_recv_count(1, :), rm_disp(:) - 1, &
         most_ptp=.TRUE., remainder_ptp=.TRUE., no_hybrid=.FALSE., &
         mp_env=mp_obj)
      CALL timestop(handle2)

      ! Now create the work index and/or copy the relevant data from the
      ! receive buffer into the local indices.
      !
      DO src_p = 0, numproc - 1
         DO blk_l = 1, total_recv_count(1, src_p)
            stored_row = recv_meta(rm_disp(src_p) + metalen*(blk_l - 1))
            stored_col = recv_meta(rm_disp(src_p) + metalen*(blk_l - 1) + 1)
            stored_blk_p = recv_meta(rm_disp(src_p) + metalen*(blk_l - 1) + 2)
            row_img = recv_meta(rm_disp(src_p) + metalen*(blk_l - 1) + 3)
            col_img = recv_meta(rm_disp(src_p) + metalen*(blk_l - 1) + 4)
            nze = row_blk_size(stored_row)*col_blk_size(stored_col)
            blk = blks(row_img, col_img)
            blks(row_img, col_img) = blks(row_img, col_img) + 1
            blk_ps(row_img, col_img) = blk_ps(row_img, col_img) + nze
            ums%mats(row_img, col_img)%wms(1)%row_i(blk) = stored_row
            ums%mats(row_img, col_img)%wms(1)%col_i(blk) = stored_col
            ums%mats(row_img, col_img)%wms(1)%blk_p(blk) = &
               SIGN(rd_disp(src_p) + ABS(stored_blk_p) - 1, stored_blk_p)
         END DO
      END DO

      ! Finalize the actual imaged matrices from the work matrices
      !
      DO row_img = 1, nrow_images
         DO col_img = 1, ncol_images
            ums%mats(row_img, col_img)%wms(1)%lastblk = blks(row_img, col_img) - 1
            ums%mats(row_img, col_img)%wms(1)%datasize = blk_ps(row_img, col_img) - 1
            !
            CALL dbcsr_data_set_size_referenced( &
               ums%mats(row_img, col_img)%wms(1)%data_area, &
               ums%mats(row_img, col_img)%wms(1)%datasize)
            IF (nrow_images .EQ. 1 .AND. ncol_images .EQ. 1 .OR. nocopy) THEN
               CALL dbcsr_special_finalize(ums%mats(row_img, col_img), reshuffle=.FALSE.)
            ELSE
               CALL dbcsr_special_finalize(ums%mats(row_img, col_img), reshuffle=.TRUE.)
            END IF

            ! Save the home process and image row and column
            CALL image_calculator(target_imgdist, &
                                  ums%mats(row_img, col_img)%index(dbcsr_slot_home_prow), &
                                  ums%mats(row_img, col_img)%index(dbcsr_slot_home_rowi), &
                                  ums%mats(row_img, col_img)%index(dbcsr_slot_home_pcol), &
                                  ums%mats(row_img, col_img)%index(dbcsr_slot_home_coli), &
                                  vprow=ums%mats(row_img, col_img)%index(dbcsr_slot_home_vprow), &
                                  vpcol=ums%mats(row_img, col_img)%index(dbcsr_slot_home_vpcol), &
                                  myrowi=row_img, mycoli=col_img, &
                                  shifting=predist_type)
         END DO
      END DO

      ! Deallocate shared temporary buffers
      !
      DEALLOCATE (send_count, recv_count)
      DEALLOCATE (total_send_count, total_recv_count)
      DEALLOCATE (sdp, smp, sd_disp, sm_disp)
      DEALLOCATE (rd_disp, rm_disp)
      DEALLOCATE (all_total_send_offset)
      DEALLOCATE (blk_ps, blks)
      DEALLOCATE (recv_meta, send_meta)

      CALL dbcsr_data_release(send_data_area)
      CALL dbcsr_data_release(received_data_area)
      CALL dbcsr_data_release(recv_data_area)

      CALL timestop(handle)
   END SUBROUTINE make_images

   SUBROUTINE dbcsr_make_images_dense(images, new_rdist, &
                                      row_map, col_map, join_cols, join_rows, new_template)
      !! Makes dense matrices for the image matrices.
      !! @note Used for making matrices dense/undense

      TYPE(dbcsr_2d_array_type), INTENT(INOUT)           :: images
         !! current (undense) matrix images, output is the dense matrix images
      TYPE(dbcsr_imagedistribution_obj), INTENT(INOUT)   :: new_rdist
         !! the new image distribution for dense matrices
      TYPE(array_i1d_obj), INTENT(IN)                    :: row_map, col_map
         !! mapping of current (undense) rows to dense rows
         !! mapping of current (undense) columns to dense columns
      LOGICAL, INTENT(IN)                                :: join_cols, join_rows
         !! make columns dense, default is yes
         !! make rows dense, default is yes
      TYPE(dbcsr_type), INTENT(IN)                       :: new_template
         !! template dense matrix for creating image matrices

      CHARACTER(len=*), PARAMETER :: routineN = 'dbcsr_make_images_dense'
      LOGICAL, PARAMETER                                 :: dbg = .FALSE.

      INTEGER                                            :: handle, mat_col, mat_row, mat_vpcol, &
                                                            mat_vprow
      INTEGER, DIMENSION(:), CONTIGUOUS, POINTER         :: und_col_blk_offsets, und_row_blk_offsets
      INTEGER, DIMENSION(dbcsr_meta_size)                :: old_meta
      REAL(kind=dp)                                      :: cs
      TYPE(array_i1d_obj)                                :: dense_local_vcols, dense_local_vrows, &
                                                            und_local_vcols, und_local_vrows
      TYPE(dbcsr_imagedistribution_obj)                  :: old_rdist
      TYPE(dbcsr_type)                                   :: tmp_mat

!   ---------------------------------------------------------------------------

      CALL timeset(routineN, handle)
      old_rdist = images%image_dist
      !
      DO mat_row = 1, images%image_dist%i%row_decimation
         DO mat_col = 1, images%image_dist%i%col_decimation
            IF (dbg) THEN
               cs = dbcsr_checksum(images%mats(mat_row, mat_col))
               WRITE (*, *) routineN//" cs pre", cs
            END IF
            mat_vprow = images%mats(mat_row, mat_col)%index(dbcsr_slot_home_vprow)
            mat_vpcol = images%mats(mat_row, mat_col)%index(dbcsr_slot_home_vpcol)
            und_row_blk_offsets => array_data(images%mats(mat_row, mat_col)%row_blk_offset)
            und_col_blk_offsets => array_data(images%mats(mat_row, mat_col)%col_blk_offset)
            CALL dbcsr_get_local_vrows(old_rdist, und_local_vrows, mat_vprow)
            CALL dbcsr_get_local_vcols(old_rdist, und_local_vcols, mat_vpcol)

            CALL dbcsr_get_local_vrows(new_rdist, dense_local_vrows, mat_vprow)
            CALL dbcsr_get_local_vcols(new_rdist, dense_local_vcols, mat_vpcol)
            ! The old matrix has to be remembered so it is copied to
            ! tmp_mat.
            old_meta(:) = images%mats(mat_row, mat_col)%index(1:dbcsr_meta_size)
            tmp_mat = dbcsr_type()
            tmp_mat = images%mats(mat_row, mat_col)
            images%mats(mat_row, mat_col) = dbcsr_type()
            CALL dbcsr_create(images%mats(mat_row, mat_col), template=new_template)
            images%mats(mat_row, mat_col)%index(dbcsr_slot_home_prow &
                                                :dbcsr_slot_home_vpcol) = &
               old_meta(dbcsr_slot_home_prow:dbcsr_slot_home_vpcol)
            CALL dbcsr_make_dense_low(tmp_mat, images%mats(mat_row, mat_col), &
                                      array_data(und_local_vrows), array_data(und_local_vcols), &
                                      und_row_blk_offsets, und_col_blk_offsets, &
                                      array_data(dense_local_vrows), &
                                      array_data(dense_local_vcols), &
                                      array_data(new_template%row_blk_offset), &
                                      array_data(new_template%col_blk_offset), &
                                      array_data(row_map), array_data(col_map), join_rows, join_cols)
            !
            CALL dbcsr_index_prune_deleted(images%mats(mat_row, mat_col))
            !
            CALL dbcsr_release(tmp_mat)
            IF (dbg) THEN
               cs = dbcsr_checksum(images%mats(mat_row, mat_col))
               WRITE (*, *) routineN//" cs pst", cs
            END IF
         END DO
      END DO
      CALL dbcsr_image_dist_release(images%image_dist)
      images%image_dist = new_rdist
      CALL dbcsr_image_dist_hold(images%image_dist)
      CALL timestop(handle)
   END SUBROUTINE dbcsr_make_images_dense

   SUBROUTINE multiply_cannon(left_set, right_set, product_matrix, &
                              retain_sparsity, &
                              filter_eps, flop, keep_product_data)
      !! Multiplies two DBCSR matrices

      TYPE(dbcsr_2d_array_type), POINTER                 :: left_set, right_set
         !! set of imaged left matrices
         !! set of imaged right matrices
      TYPE(dbcsr_type), INTENT(INOUT)                    :: product_matrix
         !! DBCSR product matrix
      LOGICAL, INTENT(IN), OPTIONAL                      :: retain_sparsity
         !! retain the sparsity of the existing product matrix; default is no
      REAL(kind=real_8), INTENT(in), OPTIONAL            :: filter_eps
      INTEGER(KIND=int_8), INTENT(OUT)                   :: flop
         !! effective flop
      LOGICAL, INTENT(IN)                                :: keep_product_data

      CHARACTER(len=*), PARAMETER :: routineN = 'multiply_cannon'
      INTEGER, PARAMETER                                 :: idata = 1, ileft = 0, imeta = 2, &
                                                            iright = 2

      INTEGER :: data_type, data_type_byte, handle, handle1, handle2, handle3, i, ithread, &
                 left_col_image, left_col_mult, left_col_nimages, left_dst_icol, left_dst_irow, &
                 left_dst_p, left_dst_pcol, left_dst_prow, left_dst_vcol, left_dst_vrow, left_max_nblks, &
                 left_max_nze, left_myfirstvcol, left_myfirstvrow, left_mypcol, left_myprow, left_npcols, &
                 left_nprows, left_recv_icol, left_recv_irow, left_recv_p, left_recv_pcol, left_recv_prow, &
                 left_recv_vcol, left_recv_vrow, left_row_image, left_row_mult, left_row_nimages, &
                 left_send_icol, left_send_irow, left_send_p, left_send_pcol, left_send_prow
      INTEGER :: left_send_vcol, left_send_vrow, left_src_icol, left_src_irow, left_src_p, &
                 left_src_pcol, left_src_prow, left_src_vcol, left_src_vrow, metronome, min_nimages, &
                 mynode, nblkrows_used, nsteps_k, nthreads, numnodes, nvirt_k, &
                 output_unit, right_col_image, right_col_mult, right_col_nimages, right_dst_icol, &
                 right_dst_irow, right_dst_p, right_dst_pcol, right_dst_prow, right_dst_vcol, &
                 right_dst_vrow, right_max_nblks, right_max_nze, right_myfirstvcol, right_myfirstvrow, &
                 right_mypcol, right_myprow, right_npcols, right_nprows, right_recv_icol, right_recv_irow
      INTEGER :: right_recv_p, right_recv_pcol, right_recv_prow, right_recv_vcol, right_recv_vrow, &
                 right_row_image, right_row_mult, right_row_nimages, right_send_icol, right_send_irow, &
                 right_send_p, right_send_pcol, right_send_prow, right_send_vcol, right_send_vrow, &
                 right_src_icol, right_src_irow, right_src_p, right_src_pcol, right_src_prow, &
                 right_src_vcol, right_src_vrow, row, size_guess, size_guess_init, stat, threads_finished, &
                 threads_finished_read, v_ki, v_ki_left, v_ki_right
      INTEGER(KIND=int_8)                                :: flop_single, flop_total, mem
      INTEGER, ALLOCATABLE, DIMENSION(:)                 :: row_counts, total_row_counts
      INTEGER, ALLOCATABLE, DIMENSION(:, :, :)           :: left_sizes, my_sizes, right_sizes
      INTEGER, ALLOCATABLE, DIMENSION(:, :, :, :)        :: all_sizes
      INTEGER, DIMENSION(:), POINTER, CONTIGUOUS :: col_blk_sizes2enum, enum2col_blk_sizes, &
                                                    enum2row_blk_sizes, &
                                                    left_index_rp, left_index_sp, m_sizes, n_sizes, &
                                                    right_index_rp, right_index_sp, &
                                                    row_blk_sizes2enum
      INTEGER, DIMENSION(:), POINTER, CONTIGUOUS         :: k_sizes
      INTEGER, DIMENSION(:, :), POINTER, CONTIGUOUS      :: left_pgrid, product_pgrid, right_pgrid
      INTEGER, SAVE                                      :: mult_id = 0
      LOGICAL                                            :: keep_sparsity, list_indexing, &
                                                            otf_filtering

      REAL(kind=sp), ALLOCATABLE, DIMENSION(:) :: left_norms, right_norms, &
                                                  row_max_epss
      REAL(kind=sp)                            :: filter_eps_sp
      TYPE(dbcsr_2d_array_type), POINTER :: left_buffer_2, left_buffer_calc, &
                                            left_buffer_comm, right_buffer_2, right_buffer_calc, right_buffer_comm
      TYPE(dbcsr_data_obj)                     :: left_data_rp, left_data_sp, &
                                                  right_data_rp, right_data_sp
      TYPE(dbcsr_data_obj), POINTER            :: trs_stackbuf_calc, &
                                                  trs_stackbuf_comm
      TYPE(dbcsr_data_obj), TARGET             :: trs_stackbuf_1, trs_stackbuf_2
      TYPE(dbcsr_mm_multrec_type_p), DIMENSION(:), ALLOCATABLE :: multrec
      TYPE(dbcsr_mp_obj)                       :: left_mp_obj, product_mp_obj, &
                                                  right_mp_obj
      TYPE(mp_comm_type)                       :: grp, mp_group
      TYPE(mp_request_type), DIMENSION(:), ALLOCATABLE :: left_data_rr, left_data_sr, left_index_rr, &
                                                         left_index_sr, right_data_rr, right_data_sr, right_index_rr, right_index_sr

!   ---------------------------------------------------------------------------

      CALL timeset(routineN, handle)
      NULLIFY (trs_stackbuf_calc, trs_stackbuf_comm)
      NULLIFY (row_blk_sizes2enum, enum2row_blk_sizes)
      NULLIFY (col_blk_sizes2enum, enum2col_blk_sizes)
      NULLIFY (k_sizes)
      !
      ALLOCATE (left_buffer_2, right_buffer_2)
      mult_id = mult_id + 1

      IF (PRESENT(retain_sparsity)) THEN
         keep_sparsity = retain_sparsity
      ELSE
         keep_sparsity = .FALSE.
      END IF
      otf_filtering = PRESENT(filter_eps)

!$OMP PARALLEL DEFAULT (NONE) &
!$OMP SHARED (multrec, nthreads, product_matrix)
!$OMP MASTER
      nthreads = 1
!$    nthreads = OMP_GET_NUM_THREADS()
      IF (.NOT. ASSOCIATED(product_matrix%wms)) &
         DBCSR_ABORT("Work matrices do not exist")
      IF (SIZE(product_matrix%wms) .NE. nthreads) &
         DBCSR_ABORT("Work matrices not correctly sized.")
      ALLOCATE (multrec(0:nthreads - 1))
!$OMP END MASTER
!$OMP END PARALLEL

      output_unit = default_output_unit
      flop_total = 0
      ! Set up variables
      data_type = dbcsr_get_data_type(product_matrix)
      data_type_byte = dbcsr_datatype_sizeof(data_type)
      left_row_nimages = left_set%image_dist%i%row_decimation
      left_row_mult = left_set%image_dist%i%row_multiplicity
      left_col_nimages = left_set%image_dist%i%col_decimation
      left_col_mult = left_set%image_dist%i%col_multiplicity
      right_row_nimages = right_set%image_dist%i%row_decimation
      right_row_mult = right_set%image_dist%i%row_multiplicity
      right_col_nimages = right_set%image_dist%i%col_decimation
      right_col_mult = right_set%image_dist%i%col_multiplicity
      left_mp_obj = dbcsr_distribution_mp(left_set%image_dist%i%main)
      right_mp_obj = dbcsr_distribution_mp(right_set%image_dist%i%main)
      product_mp_obj = dbcsr_distribution_mp(product_matrix%dist)
      numnodes = dbcsr_mp_numnodes(product_mp_obj)
      mynode = dbcsr_mp_mynode(product_mp_obj)
      left_nprows = dbcsr_mp_nprows(left_mp_obj)
      left_npcols = dbcsr_mp_npcols(left_mp_obj)
      left_myprow = dbcsr_mp_myprow(left_mp_obj)
      left_mypcol = dbcsr_mp_mypcol(left_mp_obj)
      left_myfirstvrow = dbcsr_mp_myprow(left_mp_obj)*left_row_nimages
      left_myfirstvcol = dbcsr_mp_mypcol(left_mp_obj)*left_col_nimages
      right_nprows = dbcsr_mp_nprows(right_mp_obj)
      right_npcols = dbcsr_mp_npcols(right_mp_obj)
      right_myprow = dbcsr_mp_myprow(right_mp_obj)
      right_mypcol = dbcsr_mp_mypcol(right_mp_obj)
      right_myfirstvrow = dbcsr_mp_myprow(right_mp_obj)*right_row_nimages
      right_myfirstvcol = dbcsr_mp_mypcol(right_mp_obj)*right_col_nimages
      mp_group = dbcsr_mp_group(product_mp_obj)
      left_pgrid => dbcsr_mp_pgrid(left_mp_obj)
      right_pgrid => dbcsr_mp_pgrid(right_mp_obj)
      product_pgrid => dbcsr_mp_pgrid(product_mp_obj)
      CALL dbcsr_mp_grid_setup(product_mp_obj)
      CALL dbcsr_mp_grid_setup(left_mp_obj)
      CALL dbcsr_mp_grid_setup(right_mp_obj)
      !
      ! Dummy checks
      ! left/right matching
      IF (left_col_nimages .NE. right_row_mult) &
         DBCSR_ABORT("Left/Right image mismatch")
      IF (left_col_mult .NE. right_row_nimages) &
         DBCSR_ABORT("Left/Right image mismatch")
      IF (left_col_nimages*left_npcols .NE. right_row_nimages*right_nprows) &
         DBCSR_ABORT("Left/Right total mismatch")
      ! product/left matching
      IF (left_row_mult*dbcsr_mp_nprows(product_mp_obj) .NE. left_row_nimages*left_nprows) &
         DBCSR_ABORT("Product/Left total mismatch")
      ! product/left matching
      IF (right_col_mult*dbcsr_mp_npcols(product_mp_obj) .NE. right_col_nimages*right_npcols) &
         DBCSR_ABORT("Product/Right total mismatch")
      ! Limitations
      IF (left_row_nimages .NE. 1) &
         DBCSR_ABORT("Product/Left matrix process grid mismatch")
      IF (left_row_mult .NE. 1) &
         DBCSR_ABORT("Product/Left matrix process grid mismatch")
      IF (right_col_nimages .NE. 1) &
         DBCSR_ABORT("Product/Right matrix process grid mismatch")
      IF (right_col_mult .NE. 1) &
         DBCSR_ABORT("Product/Right matrix process grid mismatch")

      dbcsr_mpi_statistics%nimages = MAX(dbcsr_mpi_statistics%nimages, left_row_nimages*left_col_nimages)
      dbcsr_mpi_statistics%nimages = MAX(dbcsr_mpi_statistics%nimages, right_row_nimages*right_col_nimages)
      !
      ! Exchange size data
      ALLOCATE (my_sizes(4, MAX(left_row_nimages, right_row_nimages), &
                         MAX(left_col_nimages, right_col_nimages)))
      my_sizes(:, :, :) = 0
      DO left_row_image = 1, left_row_nimages
         DO left_col_image = 1, left_col_nimages
            my_sizes(idata + ileft, left_row_image, left_col_image) &
               = dbcsr_data_get_size_referenced( &
                 left_set%mats(left_row_image, left_col_image)%data_area)
            my_sizes(imeta + ileft, left_row_image, left_col_image) = &
               left_set%mats(left_row_image, left_col_image)%index &
               (dbcsr_slot_size)
         END DO
      END DO

      DO right_row_image = 1, right_row_nimages
         DO right_col_image = 1, right_col_nimages
            my_sizes(idata + iright, right_row_image, right_col_image) &
               = dbcsr_data_get_size_referenced( &
                 right_set%mats(right_row_image, right_col_image)%data_area)
            my_sizes(imeta + iright, right_row_image, right_col_image) = &
               right_set%mats(right_row_image, right_col_image)%index &
               (dbcsr_slot_size)
         END DO
      END DO

      ALLOCATE (all_sizes(4, LBOUND(my_sizes, 2):UBOUND(my_sizes, 2), &
                          LBOUND(my_sizes, 3):UBOUND(my_sizes, 3), 0:numnodes - 1))
      CALL mp_allgather(my_sizes, all_sizes, mp_group)
      !
      ! Count the maximum possible multiplies per row for on-the-fly
      ! filtering.
      per_row_eps: IF (.NOT. otf_filtering) THEN
         ! These arrays must be valid when passed to called subroutines.
         ALLOCATE (left_norms(0), right_norms(0), row_max_epss(0), stat=stat)
         IF (stat .NE. 0) &
            DBCSR_ABORT("Could not allocate memory")
      ELSE
         IF (careful_mod) THEN
            IF (left_set%mats(1, 1)%bcsc) &
               DBCSR_ABORT("Can not do on-the-fly filtering with CSC-indexed matrices.")
         END IF
         IF (dbcsr_has_local_row_index(left_set%mats(1, 1))) THEN
            nblkrows_used = dbcsr_nblkrows_local(left_set%mats(1, 1))
         ELSE
            nblkrows_used = dbcsr_nblkrows_total(left_set%mats(1, 1))
         END IF
         ALLOCATE (row_max_epss(nblkrows_used), stat=stat)
         IF (stat .NE. 0) &
            DBCSR_ABORT("Could not allocate memory for left epsilons")
         ALLOCATE (row_counts(nblkrows_used), stat=stat)
         IF (stat .NE. 0) &
            DBCSR_ABORT("Could not allocate memory for left row counts")
         ! The summation could be done prow-locally but it would
         ! complicate the pre-row eps calculation.
         ALLOCATE (total_row_counts(nblkrows_used), stat=stat)
         IF (stat .NE. 0) &
            DBCSR_ABORT("Could not allocate memory for left row counts")
         ! Each prow member matrix (npcols * row_images) counts the
         ! blocks present in each of its rows.
         total_row_counts(:) = 0
         DO left_row_image = 1, left_row_nimages
            DO left_col_image = 1, left_col_nimages
               list_indexing = &
                  left_set%mats(left_row_image, left_col_image)%list_indexing
               IF (careful_mod) THEN
                  IF (list_indexing) THEN
                     IF ((left_set%mats(left_row_image, left_col_image)%nblks)*3 .NE. &
                         SIZE(left_set%mats(left_row_image, left_col_image)%coo_l)) &
                        DBCSR_ABORT("Row count mismatch")
                  ELSE
                     IF (nblkrows_used + 1 .NE. SIZE(left_set%mats(left_row_image, left_col_image)%row_p)) &
                        DBCSR_ABORT("Row count mismatch")
                  END IF
               END IF
               IF (list_indexing) THEN
                  CALL count_bins( &
                     left_set%mats(left_row_image, left_col_image)%nblks, &
                     left_set%mats(left_row_image, left_col_image)%coo_l(1::3), &
                     nblkrows_used, row_counts)
               ELSE
                  CALL dbcsr_count_row_index( &
                     left_set%mats(left_row_image, left_col_image)%row_p, &
                     row_counts, nblkrows_used)
               END IF
               total_row_counts(:) = total_row_counts(:) &
                                     + row_counts(:)
            END DO
         END DO
         ! The counted blocks are then summed up
         CALL mp_sum(total_row_counts, dbcsr_mp_my_row_group(product_mp_obj))
         ! and used to determine the maximum per-block epsilon.
         filter_eps_sp = REAL(filter_eps, KIND=KIND(row_max_epss))
!$OMP PARALLEL DO DEFAULT (NONE) &
!$OMP SHARED(nblkrows_used,row_max_epss,filter_eps_sp,&
!$OMP        total_row_counts)
         DO row = 1, nblkrows_used
            row_max_epss(row) &
               = (filter_eps_sp &
                  /REAL(MAX(1, total_row_counts(row)), KIND=KIND(row_max_epss)))**2
         END DO
!$OMP END PARALLEL DO
         !
         DEALLOCATE (row_counts)
         DEALLOCATE (total_row_counts)
      END IF per_row_eps
      !
      ! The main transfer loop goes through the virtual rows/columns.
      ! The number of steps may be smaller if the grid dimension is very
      ! non-optimal (both left column images and right row images are >
      ! 1).
      min_nimages = MIN(left_col_nimages, right_row_nimages)
      nvirt_k = left_npcols*left_col_nimages
      nsteps_k = nvirt_k/min_nimages
      !
      ! Translate the all_sizes to account for pre-distribution.  This
      ! is just done to simplify lookups.
      ALLOCATE (left_sizes(2, 0:left_nprows*left_row_nimages - 1, 0:nvirt_k - 1))
      left_sizes = -1
      DO left_src_vcol = 0, left_col_nimages*left_npcols - 1
         DO left_src_vrow = 0, left_row_nimages*left_nprows - 1
            ! Calculate what was shifted.  The left_src_v{row,col} are
            ! the "source" rows/columns; the left_dst are the shifted
            ! targets where the data was placed in make_images.
            CALL image_calculator(left_set%image_dist, &
                                  prow=left_dst_prow, pcol=left_dst_pcol, &
                                  rowi=left_dst_irow, coli=left_dst_icol, &
                                  myvprow=left_src_vrow, myvpcol=left_src_vcol, &
                                  shifting='l')
            left_dst_p = left_pgrid(left_dst_prow, left_dst_pcol)
            left_sizes(idata, left_src_vrow, left_src_vcol) = &
               all_sizes( &
               idata + ileft, left_dst_irow, left_dst_icol, left_dst_p)
            left_sizes(imeta, left_src_vrow, left_src_vcol) = &
               all_sizes( &
               imeta + ileft, left_dst_irow, left_dst_icol, left_dst_p)
         END DO
      END DO
      !
      ALLOCATE (right_sizes(2, 0:nvirt_k - 1, 0:right_npcols*right_col_nimages - 1))
      right_sizes = -1
      DO right_src_vcol = 0, right_col_nimages*right_npcols - 1
         DO right_src_vrow = 0, right_row_nimages*right_nprows - 1
            ! Calculate what was shifted.  The right_src_v{row,col} are
            ! the "source" rows/columns; the right_dst are the shifted
            ! targets where the data was placed in make_images.
            CALL image_calculator(right_set%image_dist, &
                                  prow=right_dst_prow, pcol=right_dst_pcol, &
                                  rowi=right_dst_irow, coli=right_dst_icol, &
                                  myvprow=right_src_vrow, myvpcol=right_src_vcol, &
                                  shifting='r')
            right_dst_p = right_pgrid(right_dst_prow, right_dst_pcol)
            right_sizes(idata, right_src_vrow, right_src_vcol) = &
               all_sizes( &
               idata + iright, right_dst_irow, right_dst_icol, right_dst_p)
            right_sizes(imeta, right_src_vrow, right_src_vcol) = &
               all_sizes( &
               imeta + iright, right_dst_irow, right_dst_icol, right_dst_p)
         END DO
      END DO
      !
      ! Setup product work areas
      left_max_nze = MAXVAL(all_sizes(idata + ileft, :, :, :))
      left_max_nblks = MAXVAL(all_sizes(imeta + ileft, :, :, :))
      right_max_nze = MAXVAL(all_sizes(idata + iright, :, :, :))
      right_max_nblks = MAXVAL(all_sizes(imeta + iright, :, :, :))
      !!
      ! Evaluate sizes for workspaces
      IF (.NOT. keep_sparsity) THEN
         IF (has_acc) THEN
            size_guess_init = product_matrix_size_guess(left_set%mats(1, 1), right_set%mats(1, 1), product_matrix, &
                                                        left_max_nze, right_max_nze, &
                                                        left_col_nimages, right_row_nimages, &
                                                        nthreads)
         ELSE
            size_guess_init = 1
         END IF
      END IF
      ithread = 0
!$OMP PARALLEL DEFAULT(NONE) &
!$OMP          PRIVATE (i, size_guess, ithread) &
!$OMP          SHARED (product_matrix, left_max_nze, right_max_nze) &
!$OMP          SHARED (left_set, right_set, &
!$OMP                 left_col_nimages, right_row_nimages) &
!$OMP          SHARED (nthreads, keep_sparsity, mynode, size_guess_init)
      !
!$    ithread = OMP_GET_THREAD_NUM()
      ! The work arrays have to be setup (actually, not quite sure).
      i = ithread + 1
      size_guess = product_matrix%wms(i)%datasize ! Should be minimal
      IF (.NOT. keep_sparsity) THEN
         size_guess = MAX(size_guess, size_guess_init)
      END IF
      CALL dbcsr_data_ensure_size(product_matrix%wms(i)%data_area, &
                                  size_guess)
      CALL dbcsr_data_set_size_referenced(product_matrix%wms(i)%data_area, &
                                          product_matrix%wms(i)%datasize)
      ! XXXXXXX a quick fix right now, allocation with size 1 might actually not be needed at all,
      !         but something expects this to be associated
      CALL ensure_array_size(product_matrix%wms(i)%row_i, ub=1)
      CALL ensure_array_size(product_matrix%wms(i)%col_i, ub=1)
      CALL ensure_array_size(product_matrix%wms(i)%blk_p, ub=1)
!$OMP END PARALLEL

      ! update capacity of memory-pools, +1 for the dense case
      IF (ASSOCIATED(memtype_abpanel_1%pool)) &
         CALL dbcsr_mempool_limit_capacity(memtype_abpanel_1%pool, &
                                           capacity=left_row_mult*left_col_nimages + right_row_nimages*right_col_mult + 1)
      IF (ASSOCIATED(memtype_abpanel_2%pool)) &
         CALL dbcsr_mempool_limit_capacity(memtype_abpanel_2%pool, &
                                           capacity=left_row_mult*left_col_nimages + right_row_nimages*right_col_mult + 1)
      IF (has_acc) THEN
         ! enumerate the blocksizes to keep the following 2D-arrays small.
         CALL enumerate_blk_sizes(right_set%mats(1, 1)%row_blk_size%low%data, &
                                  dbcsr_max_row_size(right_set%mats(1, 1)), &
                                  row_blk_sizes2enum, enum2row_blk_sizes)
         CALL enumerate_blk_sizes(right_set%mats(1, 1)%col_blk_size%low%data, &
                                  dbcsr_max_col_size(right_set%mats(1, 1)), &
                                  col_blk_sizes2enum, enum2col_blk_sizes)
      END IF

      !
      ! Setup the left buffer matrices
      !
      CALL buffer_matrices_ensure_size(left_set, index_size=left_max_nblks, &
                                       data_size=left_max_nze)

      CALL setup_buffer_matrices(left_buffer_2, left_row_mult, left_col_nimages, &
                                 left_set%mats(1, 1), index_size=left_max_nblks, &
                                 data_size=left_max_nze)
      IF (otf_filtering) THEN
         ALLOCATE (left_norms(left_max_nblks), stat=stat)
         IF (stat .NE. 0) &
            DBCSR_ABORT("Could not allocate memory for left norms")
         IF (stat .NE. 0) otf_filtering = .FALSE.
      END IF
      left_buffer_calc => left_set
      left_buffer_comm => left_buffer_2
      ALLOCATE (left_data_sr(left_col_nimages))
      ALLOCATE (left_index_sr(left_col_nimages))
      ALLOCATE (left_data_rr(left_col_nimages))
      ALLOCATE (left_index_rr(left_col_nimages))
      left_data_sr = mp_request_null
      left_data_rr = mp_request_null
      left_index_sr = mp_request_null
      left_index_rr = mp_request_null

      ! Setup buffers for right matrix
      CALL buffer_matrices_ensure_size(right_set, index_size=right_max_nblks, &
                                       data_size=right_max_nze)

      CALL setup_buffer_matrices(right_buffer_2, right_row_nimages, right_col_mult, &
                                 right_set%mats(1, 1), index_size=right_max_nblks, data_size=right_max_nze)
      IF (otf_filtering) THEN
         ALLOCATE (right_norms(right_max_nblks), stat=stat)
         IF (stat .NE. 0) &
            DBCSR_WARN("Could not allocate memory for right norms")
         IF (stat .NE. 0) otf_filtering = .FALSE.
      END IF
      right_buffer_calc => right_set
      right_buffer_comm => right_buffer_2
      ALLOCATE (right_data_sr(right_row_nimages))
      ALLOCATE (right_index_sr(right_row_nimages))
      ALLOCATE (right_data_rr(right_row_nimages))
      ALLOCATE (right_index_rr(right_row_nimages))
      right_data_sr = mp_request_null
      right_data_rr = mp_request_null
      right_index_sr = mp_request_null
      right_index_rr = mp_request_null
      !
      ALLOCATE (m_sizes(dbcsr_nblkrows_local(product_matrix)))
      CALL local_filter(array_data(product_matrix%row_blk_size), array_size(product_matrix%local_rows), &
                        array_data(product_matrix%local_rows), m_sizes)
      ALLOCATE (n_sizes(dbcsr_nblkcols_local(product_matrix)))
      CALL local_filter(array_data(product_matrix%col_blk_size), array_size(product_matrix%local_cols), &
                        array_data(product_matrix%local_cols), n_sizes)
      !
!$OMP PARALLEL &
!$OMP DEFAULT (NONE) &
!$OMP SHARED (left_buffer_comm, right_buffer_comm, product_matrix,&
!$OMP         keep_sparsity, filter_eps, row_max_epss, multrec, nthreads, &
!$OMP         right_data_sr, right_data_rr, left_data_sr, left_data_rr,&
!$OMP         right_index_sr, right_index_rr, left_index_sr, left_index_rr,&
!$OMP         m_sizes, n_sizes, keep_product_data), &
!$OMP PRIVATE(ithread)
      ithread = 0
!$    ithread = OMP_GET_THREAD_NUM()
      ALLOCATE (multrec(ithread)%p)
      CALL dbcsr_mm_multrec_init(multrec(ithread)%p, &
                                 product=product_matrix, &
                                 keep_sparsity=keep_sparsity, &
                                 eps=filter_eps, &
                                 row_max_epss=row_max_epss, &
                                 block_estimate=MAX(product_matrix%nblks, &
                                                    left_buffer_comm%mats(1, 1)%nblks, &
                                                    right_buffer_comm%mats(1, 1)%nblks)/nthreads, &
                                 right_row_blk_size=array_data(right_buffer_comm%mats(1, 1)%row_blk_size), &
                                 m_sizes=m_sizes, n_sizes=n_sizes, &
                                 keep_product_data=keep_product_data)
!$OMP END PARALLEL
      !
      ! Setup indexing
      CALL setup_rec_index_2d(left_set, left_row_nimages, left_col_nimages)
      CALL setup_rec_index_2d(right_set, right_row_nimages, right_col_nimages)
      !
      ! Setup the send/receive data pointers
      CALL dbcsr_data_init(left_data_sp)
      CALL dbcsr_data_init(left_data_rp)
      CALL dbcsr_data_init(right_data_sp)
      CALL dbcsr_data_init(right_data_rp)
      CALL dbcsr_data_new(left_data_sp, data_type)
      CALL dbcsr_data_new(left_data_rp, data_type)
      CALL dbcsr_data_new(right_data_sp, data_type)
      CALL dbcsr_data_new(right_data_rp, data_type)

      ! Setup transpose stackbuffers
      IF (has_acc) THEN
         CALL dbcsr_data_init(trs_stackbuf_1)
         CALL dbcsr_data_init(trs_stackbuf_2)
         CALL dbcsr_data_new(trs_stackbuf_1, data_type=dbcsr_type_int_4, &
                             data_size=2*right_max_nblks, memory_type=memtype_trsbuffer_1)
         CALL dbcsr_data_new(trs_stackbuf_2, data_type=dbcsr_type_int_4, &
                             data_size=2*right_max_nblks, memory_type=memtype_trsbuffer_2)
         trs_stackbuf_calc => trs_stackbuf_1
         trs_stackbuf_comm => trs_stackbuf_2
      END IF
      !
      ! Reset indices for virtual images
      v_ki_right = 0
      v_ki_left = 0
      !
      ! Here is the main loop.
      !
      ! In the first loop iteration, the data is fetched from the
      ! sources. In the remaining iterations, the data are exchanged
      ! among neighbors.  In the last loop only calculations take place.
      !
      CALL timeset(routineN//"_loop", handle1)
      !
      grouped_k_index: DO metronome = 0, nvirt_k - 1
         ! Wait for right matrix transfer completion. Wait in all but
         ! the first loop iteration.
         CALL timeset(routineN//"_metrocomm1", handle2)
         wait_right: IF (v_ki_right .EQ. right_row_nimages) THEN
            ! Reset index
            v_ki_right = 0
            IF (debug_mod) WRITE (*, '(1X,A)') routineN//" waiting for right"
            !
            CALL mp_waitall(right_data_sr)
            CALL mp_waitall(right_data_rr)
            CALL mp_waitall(right_index_sr)
            CALL mp_waitall(right_index_rr)
            !
            ! Repoint indices of right matrices
            DO v_ki = 0, right_row_nimages - 1
               CALL dbcsr_repoint_index(right_buffer_calc%mats(v_ki + 1, 1))
               right_buffer_calc%mats(v_ki + 1, 1)%valid = .TRUE.
            END DO
         END IF wait_right
         CALL timestop(handle2)
         !
         ! Right matrix transfer. Transfer in all but the last loop
         ! iteration.
         xfer_right: IF (v_ki_right .EQ. 0 .AND. metronome + right_row_nimages .LT. nvirt_k) THEN
            DO v_ki = 0, right_row_nimages - 1
               ! Calculate the process to send to.  It's the virtual
               ! process row -min_nimages up (i.e., smaller row number)
               ! from me.
               CALL image_calculator(right_set%image_dist, &
                                     prow=right_send_prow, rowi=right_send_irow, & ! output
                                     pcol=right_send_pcol, coli=right_send_icol, & ! output
                                     vprow=right_send_vrow, vpcol=right_send_vcol, & ! output
                                     ! myvprow goes through all of my (process row) images
                                     myvprow=v_ki + right_myfirstvrow, &
                                     myvpcol=right_myfirstvcol, & ! nothing happens in the columns
                                     vprow_shift=-right_row_nimages, &
                                     shifting='0')
               ! Calculate which data I send.
               CALL image_calculator(right_set%image_dist, &
                                     prow=right_dst_prow, rowi=right_dst_irow, &
                                     pcol=right_dst_pcol, coli=right_dst_icol, &
                                     vprow=right_dst_vrow, vpcol=right_dst_vcol, &
                                     ! myvprows goes through all of my (process row) images
                                     myvprow=v_ki + right_myfirstvrow, &
                                     myvpcol=right_myfirstvcol, & ! nothing happens in the columns
                                     vprow_shift=metronome, &
                                     ! This is with relative shifting.
                                     shifting='R')
               right_dst_p = right_pgrid(right_dst_prow, right_dst_pcol)
               CALL dbcsr_data_set_pointer( &
                  area=right_data_sp, &
                  rsize=right_sizes(idata, right_dst_vrow, right_dst_vcol), &
                  csize=1, &
                  pointee=right_buffer_calc%mats(v_ki + 1, 1)%data_area)
               right_index_sp => right_buffer_calc%mats( &
                                 v_ki + 1, 1 &
                                 )%index(1: &
                                         right_sizes(imeta, right_dst_vrow, right_dst_vcol))
               !
               ! Calculate the process to receive from
               CALL image_calculator(right_set%image_dist, &
                                     prow=right_recv_prow, rowi=right_recv_irow, &
                                     pcol=right_recv_pcol, coli=right_recv_icol, &
                                     vprow=right_recv_vrow, vpcol=right_recv_vcol, &
                                     myvprow=v_ki + right_myfirstvrow, &
                                     myvpcol=right_myfirstvcol, &
                                     vprow_shift=+right_row_nimages, & ! just the opposite as "send to"
                                     shifting='0')
               ! Calculate which data I receive
               CALL image_calculator(right_set%image_dist, &
                                     prow=right_src_prow, rowi=right_src_irow, &
                                     pcol=right_src_pcol, coli=right_src_icol, &
                                     vprow=right_src_vrow, vpcol=right_src_vcol, &
                                     myvprow=v_ki + right_myfirstvrow, &
                                     myvpcol=right_myfirstvcol, &
                                     ! receive window moves with the metronome
                                     vprow_shift=metronome + right_row_nimages, &
                                     shifting='R')
               !
               IF (has_acc) THEN
                  CALL timeset(routineN//"_acc_sync_right", handle3)
                  CALL acc_event_synchronize(right_buffer_comm%mats(v_ki + 1, 1)%data_area%d%acc_ready)
                  CALL timestop(handle3)
               END IF

               right_src_p = right_pgrid(right_src_prow, right_src_pcol)
               CALL dbcsr_data_set_pointer( &
                  area=right_data_rp, &
                  rsize=right_sizes(idata, right_src_vrow, right_src_vcol), &
                  csize=1, &
                  pointee=right_buffer_comm%mats(v_ki + 1, 1)%data_area)
               right_index_rp => right_buffer_comm%mats( &
                                 v_ki + 1, 1 &
                                 )%index(1: &
                                         right_sizes(imeta, right_src_vrow, right_src_vcol))
               !
               right_send_p = right_pgrid(right_send_prow, right_send_pcol)
               right_recv_p = right_pgrid(right_recv_prow, right_recv_pcol)
               ! These are column-communicator relative
               IF (dbcsr_mp_has_subgroups(right_mp_obj)) THEN
                  right_send_p = right_send_prow
                  right_recv_p = right_recv_prow
                  grp = dbcsr_mp_my_col_group(right_mp_obj)
               ELSE
                  grp = dbcsr_mp_group(right_mp_obj)
               END IF
               !
               CALL timeset(routineN//"_metrocomm2", handle2)
               CALL dbcsr_irecv_any(right_data_rp, right_recv_p, &
                                    grp, right_data_rr(v_ki + 1), tag=right_src_vrow)
               CALL mp_irecv(right_index_rp, right_recv_p, &
                             grp, right_index_rr(v_ki + 1), tag=right_src_vrow)
               CALL dbcsr_isend_any(right_data_sp, right_send_p, &
                                    grp, right_data_sr(v_ki + 1), tag=right_dst_vrow)
               CALL mp_isend(right_index_sp, right_send_p, &
                             grp, right_index_sr(v_ki + 1), tag=right_dst_vrow)
               dbcsr_mpi_statistics%nexchanged = dbcsr_mpi_statistics%nexchanged + 1
               CALL count_mpi_statistics(dbcsr_mpi_statistics%data_size(1, :), &
                                         dbcsr_data_get_size(right_data_rp), &
                                         data_type_byte, &
                                         dbcsr_mpi_statistics%data_size_breakdown(:, :, 1))
               CALL timestop(handle2)
            END DO
         END IF xfer_right
         !
         ! Wait for left matrix transfer completion. Wait in all but
         ! the first loop iteration.
         CALL timeset(routineN//"_metrocomm3", handle2)
         wait_left: IF (v_ki_left .EQ. left_col_nimages) THEN
            ! Reset index
            v_ki_left = 0
            IF (debug_mod) WRITE (*, '(1X,A)') routineN//" waiting for left"
            CALL mp_waitall(left_data_sr)
            CALL mp_waitall(left_data_rr)
            CALL mp_waitall(left_index_sr)
            CALL mp_waitall(left_index_rr)
            !
            ! Repoint indices of left matrices
            DO v_ki = 0, left_col_nimages - 1
               CALL dbcsr_repoint_index(left_buffer_calc%mats(1, v_ki + 1))
               left_buffer_calc%mats(1, v_ki + 1)%valid = .TRUE.
            END DO
         END IF wait_left
         CALL timestop(handle2)
         !
         ! Left matrix transfer. Transfer in all but the last processor images.
         xfer_left: IF (v_ki_left .EQ. 0 .AND. metronome + left_col_nimages .LT. nvirt_k) THEN
            DO v_ki = 0, left_col_nimages - 1
               ! Calculate the process to send to.
               CALL image_calculator(left_set%image_dist, &
                                     prow=left_send_prow, rowi=left_send_irow, & ! output
                                     pcol=left_send_pcol, coli=left_send_icol, & ! output
                                     vprow=left_send_vrow, vpcol=left_send_vcol, & ! output
                                     myvprow=left_myfirstvrow, & ! nothing happens in the rows
                                     ! go through all my column images
                                     myvpcol=v_ki + left_myfirstvcol, &
                                     ! send to process left_col_nimages left in the grid
                                     vpcol_shift=-left_col_nimages, &
                                     shifting='0')
               ! Calculate which data I send.
               CALL image_calculator(left_set%image_dist, &
                                     prow=left_dst_prow, rowi=left_dst_irow, &
                                     pcol=left_dst_pcol, coli=left_dst_icol, &
                                     vprow=left_dst_vrow, vpcol=left_dst_vcol, &
                                     myvprow=left_myfirstvrow, &
                                     ! go through all my column images
                                     myvpcol=v_ki + left_myfirstvcol, &
                                     vpcol_shift=metronome, &
                                     ! This is with relative shifting.
                                     shifting='L')
               !
               left_dst_p = left_pgrid(left_dst_prow, left_dst_pcol)
               CALL dbcsr_data_set_pointer( &
                  area=left_data_sp, &
                  rsize=left_sizes(idata, left_dst_vrow, left_dst_vcol), &
                  csize=1, &
                  pointee=left_buffer_calc%mats(1, v_ki + 1)%data_area)
               left_index_sp => left_buffer_calc%mats( &
                                1, v_ki + 1 &
                                )%index(1: &
                                        left_sizes(imeta, left_dst_vrow, left_dst_vcol))
               !
               ! Calculate the process to receive from
               CALL image_calculator(left_set%image_dist, &
                                     prow=left_recv_prow, rowi=left_recv_irow, &
                                     pcol=left_recv_pcol, coli=left_recv_icol, &
                                     vprow=left_recv_vrow, vpcol=left_recv_vcol, &
                                     myvprow=left_myfirstvrow, &
                                     myvpcol=v_ki + left_myfirstvcol, &
                                     vpcol_shift=+left_col_nimages, & ! just the opposite as "send to"
                                     shifting='0')
               ! Calculate which data I receive
               CALL image_calculator(left_set%image_dist, &
                                     prow=left_src_prow, rowi=left_src_irow, &
                                     pcol=left_src_pcol, coli=left_src_icol, &
                                     vprow=left_src_vrow, vpcol=left_src_vcol, &
                                     myvprow=left_myfirstvrow, &
                                     myvpcol=v_ki + left_myfirstvcol, &
                                     ! receive window moves with the metronome
                                     vpcol_shift=metronome + left_col_nimages, &
                                     shifting='L')
               !
               IF (has_acc) THEN
                  CALL timeset(routineN//"_acc_sync_left", handle3)
                  CALL acc_event_synchronize(left_buffer_comm%mats(1, v_ki + 1)%data_area%d%acc_ready)
                  CALL timestop(handle3)
               END IF

               left_src_p = left_pgrid(left_src_prow, left_src_pcol)
               CALL dbcsr_data_set_pointer( &
                  area=left_data_rp, &
                  rsize=left_sizes(idata, left_src_vrow, left_src_vcol), &
                  csize=1, &
                  pointee=left_buffer_comm%mats(1, v_ki + 1)%data_area)
               left_index_rp => left_buffer_comm%mats( &
                                1, v_ki + 1 &
                                )%index(1: &
                                        left_sizes(imeta, left_src_vrow, left_src_vcol))
               !
               left_send_p = left_pgrid(left_send_prow, left_send_pcol)
               left_recv_p = left_pgrid(left_recv_prow, left_recv_pcol)
               ! These are column-communicator relative
               IF (dbcsr_mp_has_subgroups(left_mp_obj)) THEN
                  left_send_p = left_send_pcol
                  left_recv_p = left_recv_pcol
                  grp = dbcsr_mp_my_row_group(left_mp_obj)
               ELSE
                  grp = dbcsr_mp_group(left_mp_obj)
               END IF
               !
               CALL timeset(routineN//"_metrocomm4", handle2)
               CALL dbcsr_irecv_any(left_data_rp, left_recv_p, &
                                    grp, left_data_rr(v_ki + 1), tag=left_src_vcol)
               CALL mp_irecv(left_index_rp, left_recv_p, &
                             grp, left_index_rr(v_ki + 1), tag=left_src_vcol)
               CALL dbcsr_isend_any(left_data_sp, left_send_p, &
                                    grp, left_data_sr(v_ki + 1), tag=left_dst_vcol)
               CALL mp_isend(left_index_sp, left_send_p, &
                             grp, left_index_sr(v_ki + 1), tag=left_dst_vcol)
               dbcsr_mpi_statistics%nexchanged = dbcsr_mpi_statistics%nexchanged + 1
               CALL count_mpi_statistics(dbcsr_mpi_statistics%data_size(2, :), &
                                         dbcsr_data_get_size(left_data_rp), &
                                         data_type_byte, &
                                         dbcsr_mpi_statistics%data_size_breakdown(:, :, 2))
               CALL timestop(handle2)
            END DO
         END IF xfer_left
         !
         ! Do multiplication
         v_ki_left = v_ki_left + 1
         v_ki_right = v_ki_right + 1
         IF (debug_mod) THEN
            CALL dbcsr_print(left_buffer_calc%mats(1, v_ki_left), nodata=.TRUE.)
            CALL dbcsr_print(right_buffer_calc%mats(v_ki_right, 1), nodata=.TRUE.)
         END IF
         !
         ! from here the code for dbcsr_mm_driver_inner_init was taken
         !
         IF (.FALSE.) WRITE (*, *) routineN//" TICK", metronome
         ! Since the right matrix is shifted vertically, the
         ! received data always has different notions of "local
         ! rows".  Thus the local_rows and global_rows must be
         ! recalculated.
         CALL dbcsr_reset_vlocals(right_buffer_calc%mats(v_ki_right, 1), &
                                  right_set%image_dist)
         CALL dbcsr_reset_vlocals(left_buffer_calc%mats(1, v_ki_left), &
                                  left_set%image_dist)
         !
         CALL ensure_array_size(k_sizes, ub=array_size(right_buffer_calc%mats(v_ki_right, 1)%local_rows))
         CALL local_filter(array_data(right_buffer_calc%mats(v_ki_right, 1)%row_blk_size), &
                           array_size(right_buffer_calc%mats(v_ki_right, 1)%local_rows), &
                           array_data(right_buffer_calc%mats(v_ki_right, 1)%local_rows), &
                           k_sizes)
         !
         IF (has_acc) THEN
            CALL dbcsr_data_host2dev(left_buffer_calc%mats(1, v_ki_left)%data_area)
            CALL dbcsr_data_host2dev(right_buffer_calc%mats(v_ki_right, 1)%data_area)
            CALL acc_transpose_blocks(right_buffer_calc%mats(v_ki_right, 1), trs_stackbuf_calc, &
                                      k_sizes, n_sizes, &
                                      row_blk_sizes2enum, enum2row_blk_sizes, &
                                      col_blk_sizes2enum, enum2col_blk_sizes)
         END IF

         ! Sets the local right-matrix columns
         IF (otf_filtering) THEN
            left_norms(:) = huge_norm
            right_norms(:) = huge_norm
            CALL calculate_norms(right_buffer_calc%mats(v_ki_right, 1), &
                                 right_norms, k_sizes, n_sizes)
            CALL calculate_norms(left_buffer_calc%mats(1, v_ki_left), &
                                 left_norms, m_sizes, k_sizes)
         END IF

         ! Wait for left and right buffers transfer to device before proceeding
         IF (has_acc) THEN
            CALL timeset(routineN//"_sync_h2d", handle2)
            CALL acc_device_synchronize()
            CALL timestop(handle2)
         END IF
         !
         flop_single = 0
         threads_finished = 0

!$OMP PARALLEL DEFAULT (NONE) &
!$OMP SHARED (left_buffer_calc, right_buffer_calc, &
!$OMP         v_ki_left, v_ki_right, handle2, handle3, &
!$OMP         product_matrix, multrec,&
!$OMP         filter_eps, right_norms, left_norms, row_max_epss, &
!$OMP         keep_sparsity,threads_finished, &
!$OMP         right_data_sr, right_data_rr, right_index_sr, right_index_rr, &
!$OMP         left_data_sr, left_data_rr, left_index_sr, left_index_rr, &
!$OMP         dbcsr_cfg, k_sizes, nvirt_k, metronome) &
!$OMP PRIVATE (ithread,nthreads,threads_finished_read) &
!$OMP REDUCTION (+: flop_single)
         ithread = 0; nthreads = 1
!$       ithread = omp_get_thread_num(); nthreads = omp_get_num_threads()

         CALL timeset(routineN//"_multrec", handle2)

         CALL dbcsr_mm_multrec_multiply(multrec(ithread)%p, &
                                        left=left_buffer_calc%mats(1, v_ki_left), &
                                        right=right_buffer_calc%mats(v_ki_right, 1), &
                                        flop=flop_single, &
                                        a_norms=left_norms, b_norms=right_norms, &
                                        k_sizes=k_sizes)

         IF (metronome == nvirt_k - 1) THEN
            CALL timeset(routineN//"_multrec_finalize", handle3)
            CALL dbcsr_mm_multrec_finalize(multrec(ithread)%p)
            DEALLOCATE (multrec(ithread)%p)
            CALL timestop(handle3)
         END IF

!$OMP ATOMIC
         threads_finished = threads_finished + 1
         IF (dbcsr_cfg%use_comm_thread%val .AND. (ithread .EQ. 0)) THEN
            DO
! requires OMP 3.1 (e.g. gcc >=4.7), for correctness, otherwise we keep fingers crossed
#if defined _OPENMP && _OPENMP >= 200711
!$OMP                 ATOMIC READ
#endif
               threads_finished_read = threads_finished
               IF (threads_finished_read .EQ. nthreads) EXIT
               CALL mp_testany(right_data_sr)
               CALL mp_testany(right_data_rr)
               CALL mp_testany(left_data_sr)
               CALL mp_testany(left_data_rr)
               CALL mp_testany(right_index_sr)
               CALL mp_testany(right_index_rr)
               CALL mp_testany(left_index_sr)
               CALL mp_testany(left_index_rr)
            END DO
         END IF
!$OMP BARRIER
         CALL timestop(handle2)

!$OMP END PARALLEL
         flop_total = flop_total + flop_single
         !
         ! Move to the next images
         IF (v_ki_left .EQ. left_col_nimages) THEN
            CALL dbcsr_switch(left_buffer_calc, left_buffer_comm)
         END IF
         IF (v_ki_right .EQ. right_row_nimages) THEN
            CALL dbcsr_switch(right_buffer_calc, right_buffer_comm)
            CALL dbcsr_switch(trs_stackbuf_calc, trs_stackbuf_comm)
         END IF

      END DO grouped_k_index
      CALL timestop(handle1)
      CALL m_memory(mem)
      max_memory = MAX(max_memory, REAL(mem))

      IF (has_acc) THEN
         CALL dbcsr_data_release(trs_stackbuf_1)
         CALL dbcsr_data_release(trs_stackbuf_2)
         DEALLOCATE (row_blk_sizes2enum, enum2row_blk_sizes)
         DEALLOCATE (col_blk_sizes2enum, enum2col_blk_sizes)
      END IF

      IF (ALLOCATED(right_norms)) THEN
         DEALLOCATE (right_norms)
      END IF
      IF (ALLOCATED(left_norms)) THEN
         DEALLOCATE (left_norms)
      END IF
      IF (ALLOCATED(row_max_epss)) THEN
         DEALLOCATE (row_max_epss)
      END IF
      !
      CALL dbcsr_destroy_array(right_buffer_2)
      CALL dbcsr_destroy_array(left_buffer_2)
      DEALLOCATE (my_sizes)
      !
      CALL dbcsr_data_clear_pointer(left_data_sp)
      CALL dbcsr_data_clear_pointer(left_data_rp)
      CALL dbcsr_data_clear_pointer(right_data_sp)
      CALL dbcsr_data_clear_pointer(right_data_rp)
      CALL dbcsr_data_release(left_data_sp)
      CALL dbcsr_data_release(left_data_rp)
      CALL dbcsr_data_release(right_data_sp)
      CALL dbcsr_data_release(right_data_rp)
      !
      DEALLOCATE (left_data_rr, left_data_sr, left_index_rr, left_index_sr, &
                  right_data_rr, right_data_sr, right_index_rr, right_index_sr)
      !
      !
      IF (debug_mod) THEN
         v_ki = 0
         DO i = 1, SIZE(product_matrix%blk_p)
            v_ki = MAX(v_ki, ABS(product_matrix%blk_p(i)))
         END DO
         WRITE (*, *) routineN//" Actual final size", &
            LOG(REAL(dbcsr_data_get_size(product_matrix%data_area)))/LOG(10.0), &
            LOG(REAL(v_ki))/LOG(10.0)
      END IF
      !
      flop = flop_total
      DEALLOCATE (left_buffer_2, right_buffer_2)
      DEALLOCATE (m_sizes, n_sizes)
      IF (ASSOCIATED(k_sizes)) DEALLOCATE (k_sizes)
      !
      CALL timestop(handle)
   END SUBROUTINE multiply_cannon

   SUBROUTINE setup_buffer_matrices(buffer_set, buff_nrows, buff_ncols, &
                                    source_matrix, index_size, data_size)
      TYPE(dbcsr_2d_array_type), INTENT(OUT)             :: buffer_set
      INTEGER, INTENT(IN)                                :: buff_nrows, buff_ncols
      TYPE(dbcsr_type), INTENT(IN)                       :: source_matrix
      INTEGER, INTENT(IN)                                :: index_size
      INTEGER, INTENT(IN), OPTIONAL                      :: data_size

      CHARACTER(len=*), PARAMETER :: routineN = 'setup_buffer_matrices'

      INTEGER                                            :: col_image, handle, row_image

!   ---------------------------------------------------------------------------

      CALL timeset(routineN, handle)

      CALL dbcsr_image_dist_init(buffer_set%image_dist)
      ALLOCATE (buffer_set%mats(buff_nrows, buff_ncols))
      DO row_image = 1, buff_nrows
         DO col_image = 1, buff_ncols
            CALL setup_buffer_matrix(buffer_set%mats(row_image, col_image), &
                                     source_matrix, index_size, data_size=data_size, &
                                     data_memory_type=memtype_abpanel_2)
         END DO
      END DO
      IF (source_matrix%local_indexing .AND. careful_mod) THEN
         IF (.NOT. array_exists(source_matrix%local_rows)) &
            DBCSR_ABORT("Local rows should exist.")
         IF (.NOT. array_exists(source_matrix%global_rows)) &
            DBCSR_ABORT("Global rows should exist.")
         !
         IF (.NOT. array_exists(source_matrix%local_cols)) &
            DBCSR_ABORT("Local cols should exist.")
         IF (.NOT. array_exists(source_matrix%global_cols)) &
            DBCSR_ABORT("Global cols should exist.")
      END IF
      CALL timestop(handle)
   END SUBROUTINE setup_buffer_matrices

   SUBROUTINE buffer_matrices_ensure_size(buffer_set, index_size, data_size)
      !! Enlarge left_set and right_set to hold any a/b-panel.
      !! left_set and right_set are created by make_images to hold the a/b-panels
      !! used for the initial cannon-tick. This routine ensures that these buffers
      !! can hold any a/b-panel occurring during a matrix multiply and makes them
      !! therefore suitable as buffers for the entire cannon algorithm.
      !! This saves memory since no separate buffers for the first cannon-tick
      !! have to be allocated.

      TYPE(dbcsr_2d_array_type), INTENT(INOUT)           :: buffer_set
      INTEGER, INTENT(IN)                                :: index_size, data_size

      CHARACTER(len=*), PARAMETER :: routineN = 'buffer_matrices_ensure_size'

      INTEGER                                            :: col_image, handle, row_image

!   ---------------------------------------------------------------------------

      CALL timeset(routineN, handle)

      DO row_image = 1, SIZE(buffer_set%mats, 1)
         DO col_image = 1, SIZE(buffer_set%mats, 2)
            CALL dbcsr_data_ensure_size(buffer_set%mats(row_image, col_image)%data_area, &
                                        data_size)
            CALL ensure_array_size(buffer_set%mats(row_image, col_image)%index, &
                                   ub=index_size, &
                                   memory_type=dbcsr_get_index_memory_type(buffer_set%mats(row_image, col_image)))
            CALL dbcsr_repoint_index(buffer_set%mats(row_image, col_image))
         END DO
      END DO
      CALL timestop(handle)
   END SUBROUTINE buffer_matrices_ensure_size

   SUBROUTINE setup_rec_index_2d(matrix_set, n_rows, n_cols)
      TYPE(dbcsr_2d_array_type), INTENT(INOUT)           :: matrix_set
      INTEGER, INTENT(IN)                                :: n_rows, n_cols

      CHARACTER(len=*), PARAMETER :: routineN = 'setup_rec_index_2d'
      LOGICAL, PARAMETER                                 :: dbg = .FALSE.

      INTEGER                                            :: handle, i_col, i_row, t_f, t_l, t_size

!$    INTEGER                                  :: ithread
      LOGICAL                                  :: thread_redist

!   ---------------------------------------------------------------------------

      CALL timeset(routineN, handle)
      DO i_row = 1, n_rows
         DO i_col = 1, n_cols
            IF (.FALSE.) &
               CALL dbcsr_reset_vlocals(matrix_set%mats(i_row, i_col), &
                                        matrix_set%image_dist)
            IF (dbg) THEN
               WRITE (*, *) routineN//" m, n, size", &
                  SIZE(matrix_set%mats(i_row, i_col)%coo_l), &
                  dbcsr_nblkrows_local(matrix_set%mats(i_row, i_col)), &
                  dbcsr_nblkcols_local(matrix_set%mats(i_row, i_col))
               WRITE (*, '(3(1X,I7))') matrix_set%mats(i_row, i_col)%coo_l
            END IF
            IF (careful_mod) THEN
               IF (SIZE(matrix_set%mats(i_row, i_col)%coo_l) .NE. matrix_set%mats(i_row, i_col)%nblks*3) &
                  DBCSR_ABORT("Block count mismatch.")
            END IF
            thread_redist = ASSOCIATED(matrix_set%mats(i_row, i_col)%thr_c)
            t_size = SIZE(matrix_set%mats(i_row, i_col)%coo_l)/3
            t_f = 1
            t_l = t_size
!$OMP       PARALLEL IF (thread_redist) DEFAULT (NONE) &
!$OMP       PRIVATE (ithread) &
!$OMP       FIRSTPRIVATE (t_f, t_l, t_size) &
!$OMP       SHARED (matrix_set, i_row, i_col, thread_redist)
!$          ithread = OMP_GET_THREAD_NUM()
!$          IF (thread_redist) THEN
!$             t_f = matrix_set%mats(i_row, i_col)%thr_c(ithread + 1) + 1
!$             t_l = matrix_set%mats(i_row, i_col)%thr_c(ithread + 2)
!$          END IF
            t_size = t_l - t_f + 1
!$OMP       BARRIER
            IF (t_size .GT. 0) THEN
               CALL call_rec_sort_index( &
                  dbcsr_nblkrows_local(matrix_set%mats(i_row, i_col)), &
                  dbcsr_nblkcols_local(matrix_set%mats(i_row, i_col)), &
                  t_size, &
                  matrix_set%mats(i_row, i_col)%coo_l((t_f*3 - 2):(t_l*3)))
            END IF
!$OMP       END PARALLEL
         END DO
      END DO
      CALL timestop(handle)
   END SUBROUTINE setup_rec_index_2d

   SUBROUTINE call_rec_sort_index(m, n, nblks, idx)
      !! Used to thunk a call to rec_sort_index
      INTEGER, INTENT(IN)                                :: m, n, nblks
      INTEGER, DIMENSION(3, 1:nblks), INTENT(INOUT)      :: idx

!   ---------------------------------------------------------------------------

      CALL rec_sort_index(1, m, 1, n, nblks, idx, 0)
   END SUBROUTINE call_rec_sort_index

   SUBROUTINE dbcsr_switch_sets(set1p, set2p)
      !! Switches pointers between two matrix sets
      TYPE(dbcsr_2d_array_type), POINTER                 :: set1p, set2p

      TYPE(dbcsr_2d_array_type), POINTER                 :: tmp_set

!   ---------------------------------------------------------------------------

      tmp_set => set1p
      set1p => set2p
      set2p => tmp_set
   END SUBROUTINE dbcsr_switch_sets

   SUBROUTINE dbcsr_switch_d_ptrs(area1p, area2p)
      !! Switches pointers between two data areas
      TYPE(dbcsr_data_obj), POINTER                      :: area1p, area2p

      TYPE(dbcsr_data_obj), POINTER                      :: tmp_p

      tmp_p => area1p
      area1p => area2p
      area2p => tmp_p
   END SUBROUTINE dbcsr_switch_d_ptrs

# 1 "/__w/dbcsr/dbcsr/src/mm/../data/dbcsr.fypp" 1
# 9 "/__w/dbcsr/dbcsr/src/mm/../data/dbcsr.fypp"

# 11 "/__w/dbcsr/dbcsr/src/mm/../data/dbcsr.fypp"

# 169 "/__w/dbcsr/dbcsr/src/mm/../data/dbcsr.fypp"
# 1926 "/__w/dbcsr/dbcsr/src/mm/dbcsr_mm_cannon.F" 2
# 1927 "/__w/dbcsr/dbcsr/src/mm/dbcsr_mm_cannon.F"

! **************************************************************************************************
      SUBROUTINE prepare_buffers_d (negate_real, negate_imaginary, &
                                                iter, row, col, blk, blk_p, bp, &
                                                row_size, col_size, nze, nsymmetries, symmetry_i, &
                                                stored_row, stored_col, tr_row_size, tr_col_size, tr, &
                                                row_img, col_img, nrow_images, ncol_images, &
                                                row_img_dist, col_img_dist, predist_type_fwd, blacs2mpi, &
                                                target_imgdist, prow, pcol, rowi, coli, &
                                                row_dist, col_dist, dst_p, sm_pos, myt_smp, metalen, &
                                                sd_pos, myt_sdp, send_meta, sd_disp, &
                                                data_area, send_data_area, scale_neg_one, scale_value)
     !! Prepare buffers for multiplications
         LOGICAL, INTENT(IN)                                     :: negate_real, negate_imaginary
         TYPE(dbcsr_iterator), INTENT(INOUT)                     :: iter
         INTEGER, INTENT(INOUT)                                  :: row, col, blk, blk_p, row_size, col_size, &
                                                                    nze, bp, symmetry_i, &
                                                                    stored_row, stored_col, tr_row_size, tr_col_size, &
                                                                    row_img, col_img, prow, pcol, rowi, coli, &
                                                                    dst_p, sm_pos, sd_pos
         INTEGER, INTENT(IN)                                     :: nsymmetries, nrow_images, ncol_images, metalen
         LOGICAL, INTENT(INOUT)                                  :: tr
         INTEGER, DIMENSION(:), INTENT(IN), CONTIGUOUS, POINTER  :: row_img_dist, col_img_dist, row_dist, col_dist
         INTEGER, DIMENSION(:), ALLOCATABLE, INTENT(IN)          :: sd_disp
         INTEGER, DIMENSION(:), ALLOCATABLE, INTENT(INOUT)       :: myt_smp, myt_sdp, send_meta
         TYPE(dbcsr_imagedistribution_obj), INTENT(IN)           :: target_imgdist
         INTEGER, DIMENSION(:, :), INTENT(IN), CONTIGUOUS, POINTER :: blacs2mpi
         CHARACTER, INTENT(IN)                                   :: predist_type_fwd
         REAL(kind=real_8), DIMENSION(:), INTENT(IN), CONTIGUOUS         :: data_area
         TYPE(dbcsr_data_obj), INTENT(INOUT)                     :: send_data_area
         TYPE(dbcsr_scalar_type), INTENT(IN)                     :: scale_neg_one
         TYPE(dbcsr_scalar_type), INTENT(IN), OPTIONAL           :: scale_value

         DO WHILE (dbcsr_iterator_blocks_left(iter))
            CALL dbcsr_iterator_next_block(iter, row, col, blk, blk_p=blk_p, &
                                           row_size=row_size, col_size=col_size)
            nze = row_size*col_size
            IF (nze .EQ. 0) CYCLE
            bp = ABS(blk_p)
            DO symmetry_i = 1, nsymmetries
               IF (symmetry_i .EQ. 1) THEN
                  stored_row = row; stored_col = col; tr = blk_p .LT. 0
                  tr_row_size = col_size; tr_col_size = row_size
               ELSE
                  IF (row .EQ. col) CYCLE
                  stored_row = col; stored_col = row; tr = blk_p .GT. 0
                  tr_row_size = row_size; tr_col_size = col_size
               END IF
               ! Where do we send this block?
               row_img = 1
               IF (nrow_images .GT. 1) row_img = row_img_dist(stored_row)
               col_img = 1
               IF (ncol_images .GT. 1) col_img = col_img_dist(stored_col)
               CALL image_calculator(target_imgdist, &
                                     prow=prow, rowi=rowi, &
                                     pcol=pcol, coli=coli, &
                                     myprow=row_dist(stored_row), myrowi=row_img, &
                                     mypcol=col_dist(stored_col), mycoli=col_img, &
                                     shifting=predist_type_fwd)
               dst_p = blacs2mpi(prow, pcol)
               sm_pos = myt_smp(dst_p)
               myt_smp(dst_p) = myt_smp(dst_p) + metalen
               sd_pos = myt_sdp(dst_p)
               myt_sdp(dst_p) = myt_sdp(dst_p) + nze
               IF (tr) THEN
                  IF (PRESENT(scale_value)) THEN
                     CALL dbcsr_block_transpose(send_data_area%d%r_dp (sd_pos:myt_sdp(dst_p) - 1), &
                                                data_area(bp:bp + nze - 1)*scale_value%r_dp, &
                                                tr_row_size, tr_col_size)
                  ELSE
                     CALL dbcsr_block_transpose(send_data_area%d%r_dp (sd_pos:myt_sdp(dst_p) - 1), &
                                                data_area(bp:bp + nze - 1), &
                                                tr_row_size, tr_col_size)
                  END IF
                  IF (negate_real .AND. negate_imaginary) THEN
                     send_data_area%d%r_dp (sd_pos:myt_sdp(dst_p) - 1) = &
                        send_data_area%d%r_dp (sd_pos:myt_sdp(dst_p) - 1)*scale_neg_one%r_dp
                  ELSEIF (negate_real) THEN
# 2006 "/__w/dbcsr/dbcsr/src/mm/dbcsr_mm_cannon.F"
                        send_data_area%d%r_dp (sd_pos:myt_sdp(dst_p) - 1) = &
                           -send_data_area%d%r_dp (sd_pos:myt_sdp(dst_p) - 1)
# 2015 "/__w/dbcsr/dbcsr/src/mm/dbcsr_mm_cannon.F"
                  ELSEIF (negate_imaginary) THEN
# 2020 "/__w/dbcsr/dbcsr/src/mm/dbcsr_mm_cannon.F"
                  END IF
               ELSE
                  ! Copy the block
                  IF (PRESENT(scale_value)) THEN
                     send_data_area%d%r_dp (sd_pos:myt_sdp(dst_p) - 1) = &
                        data_area(bp:bp + nze - 1)*scale_value%r_dp
                  ELSE
                     send_data_area%d%r_dp (sd_pos:myt_sdp(dst_p) - 1) = data_area(bp:bp + nze - 1)
                  END IF
               END IF

               send_meta(sm_pos) = stored_row
               send_meta(sm_pos + 1) = stored_col
               send_meta(sm_pos + 2) = sd_pos - sd_disp(dst_p) + 1
               send_meta(sm_pos + 3) = rowi
               send_meta(sm_pos + 4) = coli
            END DO
         END DO
      END SUBROUTINE prepare_buffers_d
# 1927 "/__w/dbcsr/dbcsr/src/mm/dbcsr_mm_cannon.F"

! **************************************************************************************************
      SUBROUTINE prepare_buffers_s (negate_real, negate_imaginary, &
                                                iter, row, col, blk, blk_p, bp, &
                                                row_size, col_size, nze, nsymmetries, symmetry_i, &
                                                stored_row, stored_col, tr_row_size, tr_col_size, tr, &
                                                row_img, col_img, nrow_images, ncol_images, &
                                                row_img_dist, col_img_dist, predist_type_fwd, blacs2mpi, &
                                                target_imgdist, prow, pcol, rowi, coli, &
                                                row_dist, col_dist, dst_p, sm_pos, myt_smp, metalen, &
                                                sd_pos, myt_sdp, send_meta, sd_disp, &
                                                data_area, send_data_area, scale_neg_one, scale_value)
     !! Prepare buffers for multiplications
         LOGICAL, INTENT(IN)                                     :: negate_real, negate_imaginary
         TYPE(dbcsr_iterator), INTENT(INOUT)                     :: iter
         INTEGER, INTENT(INOUT)                                  :: row, col, blk, blk_p, row_size, col_size, &
                                                                    nze, bp, symmetry_i, &
                                                                    stored_row, stored_col, tr_row_size, tr_col_size, &
                                                                    row_img, col_img, prow, pcol, rowi, coli, &
                                                                    dst_p, sm_pos, sd_pos
         INTEGER, INTENT(IN)                                     :: nsymmetries, nrow_images, ncol_images, metalen
         LOGICAL, INTENT(INOUT)                                  :: tr
         INTEGER, DIMENSION(:), INTENT(IN), CONTIGUOUS, POINTER  :: row_img_dist, col_img_dist, row_dist, col_dist
         INTEGER, DIMENSION(:), ALLOCATABLE, INTENT(IN)          :: sd_disp
         INTEGER, DIMENSION(:), ALLOCATABLE, INTENT(INOUT)       :: myt_smp, myt_sdp, send_meta
         TYPE(dbcsr_imagedistribution_obj), INTENT(IN)           :: target_imgdist
         INTEGER, DIMENSION(:, :), INTENT(IN), CONTIGUOUS, POINTER :: blacs2mpi
         CHARACTER, INTENT(IN)                                   :: predist_type_fwd
         REAL(kind=real_4), DIMENSION(:), INTENT(IN), CONTIGUOUS         :: data_area
         TYPE(dbcsr_data_obj), INTENT(INOUT)                     :: send_data_area
         TYPE(dbcsr_scalar_type), INTENT(IN)                     :: scale_neg_one
         TYPE(dbcsr_scalar_type), INTENT(IN), OPTIONAL           :: scale_value

         DO WHILE (dbcsr_iterator_blocks_left(iter))
            CALL dbcsr_iterator_next_block(iter, row, col, blk, blk_p=blk_p, &
                                           row_size=row_size, col_size=col_size)
            nze = row_size*col_size
            IF (nze .EQ. 0) CYCLE
            bp = ABS(blk_p)
            DO symmetry_i = 1, nsymmetries
               IF (symmetry_i .EQ. 1) THEN
                  stored_row = row; stored_col = col; tr = blk_p .LT. 0
                  tr_row_size = col_size; tr_col_size = row_size
               ELSE
                  IF (row .EQ. col) CYCLE
                  stored_row = col; stored_col = row; tr = blk_p .GT. 0
                  tr_row_size = row_size; tr_col_size = col_size
               END IF
               ! Where do we send this block?
               row_img = 1
               IF (nrow_images .GT. 1) row_img = row_img_dist(stored_row)
               col_img = 1
               IF (ncol_images .GT. 1) col_img = col_img_dist(stored_col)
               CALL image_calculator(target_imgdist, &
                                     prow=prow, rowi=rowi, &
                                     pcol=pcol, coli=coli, &
                                     myprow=row_dist(stored_row), myrowi=row_img, &
                                     mypcol=col_dist(stored_col), mycoli=col_img, &
                                     shifting=predist_type_fwd)
               dst_p = blacs2mpi(prow, pcol)
               sm_pos = myt_smp(dst_p)
               myt_smp(dst_p) = myt_smp(dst_p) + metalen
               sd_pos = myt_sdp(dst_p)
               myt_sdp(dst_p) = myt_sdp(dst_p) + nze
               IF (tr) THEN
                  IF (PRESENT(scale_value)) THEN
                     CALL dbcsr_block_transpose(send_data_area%d%r_sp (sd_pos:myt_sdp(dst_p) - 1), &
                                                data_area(bp:bp + nze - 1)*scale_value%r_sp, &
                                                tr_row_size, tr_col_size)
                  ELSE
                     CALL dbcsr_block_transpose(send_data_area%d%r_sp (sd_pos:myt_sdp(dst_p) - 1), &
                                                data_area(bp:bp + nze - 1), &
                                                tr_row_size, tr_col_size)
                  END IF
                  IF (negate_real .AND. negate_imaginary) THEN
                     send_data_area%d%r_sp (sd_pos:myt_sdp(dst_p) - 1) = &
                        send_data_area%d%r_sp (sd_pos:myt_sdp(dst_p) - 1)*scale_neg_one%r_sp
                  ELSEIF (negate_real) THEN
# 2006 "/__w/dbcsr/dbcsr/src/mm/dbcsr_mm_cannon.F"
                        send_data_area%d%r_sp (sd_pos:myt_sdp(dst_p) - 1) = &
                           -send_data_area%d%r_sp (sd_pos:myt_sdp(dst_p) - 1)
# 2015 "/__w/dbcsr/dbcsr/src/mm/dbcsr_mm_cannon.F"
                  ELSEIF (negate_imaginary) THEN
# 2020 "/__w/dbcsr/dbcsr/src/mm/dbcsr_mm_cannon.F"
                  END IF
               ELSE
                  ! Copy the block
                  IF (PRESENT(scale_value)) THEN
                     send_data_area%d%r_sp (sd_pos:myt_sdp(dst_p) - 1) = &
                        data_area(bp:bp + nze - 1)*scale_value%r_sp
                  ELSE
                     send_data_area%d%r_sp (sd_pos:myt_sdp(dst_p) - 1) = data_area(bp:bp + nze - 1)
                  END IF
               END IF

               send_meta(sm_pos) = stored_row
               send_meta(sm_pos + 1) = stored_col
               send_meta(sm_pos + 2) = sd_pos - sd_disp(dst_p) + 1
               send_meta(sm_pos + 3) = rowi
               send_meta(sm_pos + 4) = coli
            END DO
         END DO
      END SUBROUTINE prepare_buffers_s
# 1927 "/__w/dbcsr/dbcsr/src/mm/dbcsr_mm_cannon.F"

! **************************************************************************************************
      SUBROUTINE prepare_buffers_z (negate_real, negate_imaginary, &
                                                iter, row, col, blk, blk_p, bp, &
                                                row_size, col_size, nze, nsymmetries, symmetry_i, &
                                                stored_row, stored_col, tr_row_size, tr_col_size, tr, &
                                                row_img, col_img, nrow_images, ncol_images, &
                                                row_img_dist, col_img_dist, predist_type_fwd, blacs2mpi, &
                                                target_imgdist, prow, pcol, rowi, coli, &
                                                row_dist, col_dist, dst_p, sm_pos, myt_smp, metalen, &
                                                sd_pos, myt_sdp, send_meta, sd_disp, &
                                                data_area, send_data_area, scale_neg_one, scale_value)
     !! Prepare buffers for multiplications
         LOGICAL, INTENT(IN)                                     :: negate_real, negate_imaginary
         TYPE(dbcsr_iterator), INTENT(INOUT)                     :: iter
         INTEGER, INTENT(INOUT)                                  :: row, col, blk, blk_p, row_size, col_size, &
                                                                    nze, bp, symmetry_i, &
                                                                    stored_row, stored_col, tr_row_size, tr_col_size, &
                                                                    row_img, col_img, prow, pcol, rowi, coli, &
                                                                    dst_p, sm_pos, sd_pos
         INTEGER, INTENT(IN)                                     :: nsymmetries, nrow_images, ncol_images, metalen
         LOGICAL, INTENT(INOUT)                                  :: tr
         INTEGER, DIMENSION(:), INTENT(IN), CONTIGUOUS, POINTER  :: row_img_dist, col_img_dist, row_dist, col_dist
         INTEGER, DIMENSION(:), ALLOCATABLE, INTENT(IN)          :: sd_disp
         INTEGER, DIMENSION(:), ALLOCATABLE, INTENT(INOUT)       :: myt_smp, myt_sdp, send_meta
         TYPE(dbcsr_imagedistribution_obj), INTENT(IN)           :: target_imgdist
         INTEGER, DIMENSION(:, :), INTENT(IN), CONTIGUOUS, POINTER :: blacs2mpi
         CHARACTER, INTENT(IN)                                   :: predist_type_fwd
         COMPLEX(kind=real_8), DIMENSION(:), INTENT(IN), CONTIGUOUS         :: data_area
         TYPE(dbcsr_data_obj), INTENT(INOUT)                     :: send_data_area
         TYPE(dbcsr_scalar_type), INTENT(IN)                     :: scale_neg_one
         TYPE(dbcsr_scalar_type), INTENT(IN), OPTIONAL           :: scale_value

         DO WHILE (dbcsr_iterator_blocks_left(iter))
            CALL dbcsr_iterator_next_block(iter, row, col, blk, blk_p=blk_p, &
                                           row_size=row_size, col_size=col_size)
            nze = row_size*col_size
            IF (nze .EQ. 0) CYCLE
            bp = ABS(blk_p)
            DO symmetry_i = 1, nsymmetries
               IF (symmetry_i .EQ. 1) THEN
                  stored_row = row; stored_col = col; tr = blk_p .LT. 0
                  tr_row_size = col_size; tr_col_size = row_size
               ELSE
                  IF (row .EQ. col) CYCLE
                  stored_row = col; stored_col = row; tr = blk_p .GT. 0
                  tr_row_size = row_size; tr_col_size = col_size
               END IF
               ! Where do we send this block?
               row_img = 1
               IF (nrow_images .GT. 1) row_img = row_img_dist(stored_row)
               col_img = 1
               IF (ncol_images .GT. 1) col_img = col_img_dist(stored_col)
               CALL image_calculator(target_imgdist, &
                                     prow=prow, rowi=rowi, &
                                     pcol=pcol, coli=coli, &
                                     myprow=row_dist(stored_row), myrowi=row_img, &
                                     mypcol=col_dist(stored_col), mycoli=col_img, &
                                     shifting=predist_type_fwd)
               dst_p = blacs2mpi(prow, pcol)
               sm_pos = myt_smp(dst_p)
               myt_smp(dst_p) = myt_smp(dst_p) + metalen
               sd_pos = myt_sdp(dst_p)
               myt_sdp(dst_p) = myt_sdp(dst_p) + nze
               IF (tr) THEN
                  IF (PRESENT(scale_value)) THEN
                     CALL dbcsr_block_transpose(send_data_area%d%c_dp (sd_pos:myt_sdp(dst_p) - 1), &
                                                data_area(bp:bp + nze - 1)*scale_value%c_dp, &
                                                tr_row_size, tr_col_size)
                  ELSE
                     CALL dbcsr_block_transpose(send_data_area%d%c_dp (sd_pos:myt_sdp(dst_p) - 1), &
                                                data_area(bp:bp + nze - 1), &
                                                tr_row_size, tr_col_size)
                  END IF
                  IF (negate_real .AND. negate_imaginary) THEN
                     send_data_area%d%c_dp (sd_pos:myt_sdp(dst_p) - 1) = &
                        send_data_area%d%c_dp (sd_pos:myt_sdp(dst_p) - 1)*scale_neg_one%c_dp
                  ELSEIF (negate_real) THEN
# 2009 "/__w/dbcsr/dbcsr/src/mm/dbcsr_mm_cannon.F"
                        send_data_area%d%c_dp (sd_pos:myt_sdp(dst_p) - 1) = &
                           CMPLX( &
                           -REAL(send_data_area%d%c_dp (sd_pos:myt_sdp(dst_p) - 1), KIND=real_8), &
                           AIMAG(send_data_area%d%c_dp (sd_pos:myt_sdp(dst_p) - 1)), &
                           KIND=real_8)
# 2015 "/__w/dbcsr/dbcsr/src/mm/dbcsr_mm_cannon.F"
                  ELSEIF (negate_imaginary) THEN
# 2017 "/__w/dbcsr/dbcsr/src/mm/dbcsr_mm_cannon.F"
                        send_data_area%d%c_dp (sd_pos:myt_sdp(dst_p) - 1) = &
                           CONJG(send_data_area%d%c_dp (sd_pos:myt_sdp(dst_p) - 1))
# 2020 "/__w/dbcsr/dbcsr/src/mm/dbcsr_mm_cannon.F"
                  END IF
               ELSE
                  ! Copy the block
                  IF (PRESENT(scale_value)) THEN
                     send_data_area%d%c_dp (sd_pos:myt_sdp(dst_p) - 1) = &
                        data_area(bp:bp + nze - 1)*scale_value%c_dp
                  ELSE
                     send_data_area%d%c_dp (sd_pos:myt_sdp(dst_p) - 1) = data_area(bp:bp + nze - 1)
                  END IF
               END IF

               send_meta(sm_pos) = stored_row
               send_meta(sm_pos + 1) = stored_col
               send_meta(sm_pos + 2) = sd_pos - sd_disp(dst_p) + 1
               send_meta(sm_pos + 3) = rowi
               send_meta(sm_pos + 4) = coli
            END DO
         END DO
      END SUBROUTINE prepare_buffers_z
# 1927 "/__w/dbcsr/dbcsr/src/mm/dbcsr_mm_cannon.F"

! **************************************************************************************************
      SUBROUTINE prepare_buffers_c (negate_real, negate_imaginary, &
                                                iter, row, col, blk, blk_p, bp, &
                                                row_size, col_size, nze, nsymmetries, symmetry_i, &
                                                stored_row, stored_col, tr_row_size, tr_col_size, tr, &
                                                row_img, col_img, nrow_images, ncol_images, &
                                                row_img_dist, col_img_dist, predist_type_fwd, blacs2mpi, &
                                                target_imgdist, prow, pcol, rowi, coli, &
                                                row_dist, col_dist, dst_p, sm_pos, myt_smp, metalen, &
                                                sd_pos, myt_sdp, send_meta, sd_disp, &
                                                data_area, send_data_area, scale_neg_one, scale_value)
     !! Prepare buffers for multiplications
         LOGICAL, INTENT(IN)                                     :: negate_real, negate_imaginary
         TYPE(dbcsr_iterator), INTENT(INOUT)                     :: iter
         INTEGER, INTENT(INOUT)                                  :: row, col, blk, blk_p, row_size, col_size, &
                                                                    nze, bp, symmetry_i, &
                                                                    stored_row, stored_col, tr_row_size, tr_col_size, &
                                                                    row_img, col_img, prow, pcol, rowi, coli, &
                                                                    dst_p, sm_pos, sd_pos
         INTEGER, INTENT(IN)                                     :: nsymmetries, nrow_images, ncol_images, metalen
         LOGICAL, INTENT(INOUT)                                  :: tr
         INTEGER, DIMENSION(:), INTENT(IN), CONTIGUOUS, POINTER  :: row_img_dist, col_img_dist, row_dist, col_dist
         INTEGER, DIMENSION(:), ALLOCATABLE, INTENT(IN)          :: sd_disp
         INTEGER, DIMENSION(:), ALLOCATABLE, INTENT(INOUT)       :: myt_smp, myt_sdp, send_meta
         TYPE(dbcsr_imagedistribution_obj), INTENT(IN)           :: target_imgdist
         INTEGER, DIMENSION(:, :), INTENT(IN), CONTIGUOUS, POINTER :: blacs2mpi
         CHARACTER, INTENT(IN)                                   :: predist_type_fwd
         COMPLEX(kind=real_4), DIMENSION(:), INTENT(IN), CONTIGUOUS         :: data_area
         TYPE(dbcsr_data_obj), INTENT(INOUT)                     :: send_data_area
         TYPE(dbcsr_scalar_type), INTENT(IN)                     :: scale_neg_one
         TYPE(dbcsr_scalar_type), INTENT(IN), OPTIONAL           :: scale_value

         DO WHILE (dbcsr_iterator_blocks_left(iter))
            CALL dbcsr_iterator_next_block(iter, row, col, blk, blk_p=blk_p, &
                                           row_size=row_size, col_size=col_size)
            nze = row_size*col_size
            IF (nze .EQ. 0) CYCLE
            bp = ABS(blk_p)
            DO symmetry_i = 1, nsymmetries
               IF (symmetry_i .EQ. 1) THEN
                  stored_row = row; stored_col = col; tr = blk_p .LT. 0
                  tr_row_size = col_size; tr_col_size = row_size
               ELSE
                  IF (row .EQ. col) CYCLE
                  stored_row = col; stored_col = row; tr = blk_p .GT. 0
                  tr_row_size = row_size; tr_col_size = col_size
               END IF
               ! Where do we send this block?
               row_img = 1
               IF (nrow_images .GT. 1) row_img = row_img_dist(stored_row)
               col_img = 1
               IF (ncol_images .GT. 1) col_img = col_img_dist(stored_col)
               CALL image_calculator(target_imgdist, &
                                     prow=prow, rowi=rowi, &
                                     pcol=pcol, coli=coli, &
                                     myprow=row_dist(stored_row), myrowi=row_img, &
                                     mypcol=col_dist(stored_col), mycoli=col_img, &
                                     shifting=predist_type_fwd)
               dst_p = blacs2mpi(prow, pcol)
               sm_pos = myt_smp(dst_p)
               myt_smp(dst_p) = myt_smp(dst_p) + metalen
               sd_pos = myt_sdp(dst_p)
               myt_sdp(dst_p) = myt_sdp(dst_p) + nze
               IF (tr) THEN
                  IF (PRESENT(scale_value)) THEN
                     CALL dbcsr_block_transpose(send_data_area%d%c_sp (sd_pos:myt_sdp(dst_p) - 1), &
                                                data_area(bp:bp + nze - 1)*scale_value%c_sp, &
                                                tr_row_size, tr_col_size)
                  ELSE
                     CALL dbcsr_block_transpose(send_data_area%d%c_sp (sd_pos:myt_sdp(dst_p) - 1), &
                                                data_area(bp:bp + nze - 1), &
                                                tr_row_size, tr_col_size)
                  END IF
                  IF (negate_real .AND. negate_imaginary) THEN
                     send_data_area%d%c_sp (sd_pos:myt_sdp(dst_p) - 1) = &
                        send_data_area%d%c_sp (sd_pos:myt_sdp(dst_p) - 1)*scale_neg_one%c_sp
                  ELSEIF (negate_real) THEN
# 2009 "/__w/dbcsr/dbcsr/src/mm/dbcsr_mm_cannon.F"
                        send_data_area%d%c_sp (sd_pos:myt_sdp(dst_p) - 1) = &
                           CMPLX( &
                           -REAL(send_data_area%d%c_sp (sd_pos:myt_sdp(dst_p) - 1), KIND=real_4), &
                           AIMAG(send_data_area%d%c_sp (sd_pos:myt_sdp(dst_p) - 1)), &
                           KIND=real_4)
# 2015 "/__w/dbcsr/dbcsr/src/mm/dbcsr_mm_cannon.F"
                  ELSEIF (negate_imaginary) THEN
# 2017 "/__w/dbcsr/dbcsr/src/mm/dbcsr_mm_cannon.F"
                        send_data_area%d%c_sp (sd_pos:myt_sdp(dst_p) - 1) = &
                           CONJG(send_data_area%d%c_sp (sd_pos:myt_sdp(dst_p) - 1))
# 2020 "/__w/dbcsr/dbcsr/src/mm/dbcsr_mm_cannon.F"
                  END IF
               ELSE
                  ! Copy the block
                  IF (PRESENT(scale_value)) THEN
                     send_data_area%d%c_sp (sd_pos:myt_sdp(dst_p) - 1) = &
                        data_area(bp:bp + nze - 1)*scale_value%c_sp
                  ELSE
                     send_data_area%d%c_sp (sd_pos:myt_sdp(dst_p) - 1) = data_area(bp:bp + nze - 1)
                  END IF
               END IF

               send_meta(sm_pos) = stored_row
               send_meta(sm_pos + 1) = stored_col
               send_meta(sm_pos + 2) = sd_pos - sd_disp(dst_p) + 1
               send_meta(sm_pos + 3) = rowi
               send_meta(sm_pos + 4) = coli
            END DO
         END DO
      END SUBROUTINE prepare_buffers_c
# 2040 "/__w/dbcsr/dbcsr/src/mm/dbcsr_mm_cannon.F"

END MODULE dbcsr_mm_cannon