dbcsr_mm_3d.F Source File


Source Code

# 1 "/__w/dbcsr/dbcsr/src/mm/dbcsr_mm_3d.F" 1
!--------------------------------------------------------------------------------------------------!
! Copyright (C) by the DBCSR developers group - All rights reserved                                !
! This file is part of the DBCSR library.                                                          !
!                                                                                                  !
! For information on the license, see the LICENSE file.                                            !
! For further information please visit https://dbcsr.cp2k.org                                      !
! SPDX-License-Identifier: GPL-2.0+                                                                !
!--------------------------------------------------------------------------------------------------!

MODULE dbcsr_mm_3d
   !! 3D matrix-matrix multiplication.
   !! <b>Modification history:</b>
   !! - 2016-08    Code organization (Alfio Lazzaro).
   !! - 2017-02    Remove clusters (Alfio Lazzaro).

   USE dbcsr_acc_event, ONLY: acc_event_create, &
                              acc_event_destroy, &
                              acc_event_synchronize
   USE dbcsr_acc_device, ONLY: acc_device_synchronize
   USE dbcsr_array_types, ONLY: array_data, &
                                array_get, &
                                array_size
   USE dbcsr_block_operations, ONLY: dbcsr_block_conjg, &
                                     dbcsr_block_copy_aa, &
                                     dbcsr_block_real_neg, &
                                     dbcsr_block_scale, &
                                     dbcsr_block_transpose_aa, &
                                     dbcsr_data_clear, &
                                     dbcsr_data_set
   USE dbcsr_config, ONLY: dbcsr_cfg, &
                           use_acc
   USE dbcsr_data_methods, ONLY: &
      dbcsr_data_clear_pointer, dbcsr_data_ensure_size, dbcsr_data_exists, &
      dbcsr_data_get_memory_type, dbcsr_data_get_size, dbcsr_data_get_size_referenced, &
      dbcsr_data_get_type, dbcsr_data_host2dev, dbcsr_data_init, dbcsr_data_new, &
      dbcsr_data_release, dbcsr_data_set_pointer, dbcsr_data_set_size_referenced, &
      dbcsr_data_valid, dbcsr_scalar_are_equal, dbcsr_scalar_negative, dbcsr_scalar_one, &
      dbcsr_type_1d_to_2d
   USE dbcsr_data_types, ONLY: dbcsr_datatype_sizeof
   USE dbcsr_dist_methods, ONLY: &
      dbcsr_distribution_col_dist, dbcsr_distribution_has_threads, &
      dbcsr_distribution_local_cols, dbcsr_distribution_local_rows, &
      dbcsr_distribution_max_col_dist, dbcsr_distribution_max_row_dist, dbcsr_distribution_mp, &
      dbcsr_distribution_row_dist, dbcsr_distribution_thread_dist
   USE dbcsr_dist_util, ONLY: find_block_of_element
   USE dbcsr_index_operations, ONLY: dbcsr_repoint_index
   USE dbcsr_iterator_operations, ONLY: dbcsr_iterator_blocks_left, &
                                        dbcsr_iterator_next_block, &
                                        dbcsr_iterator_start, &
                                        dbcsr_iterator_stop
   USE dbcsr_kinds, ONLY: int_8, &
                          real_8, &
                          sp
   USE dbcsr_machine, ONLY: m_memory
   USE dbcsr_mem_methods, ONLY: dbcsr_mempool_limit_capacity
   USE dbcsr_methods, ONLY: &
      dbcsr_col_block_offsets, dbcsr_col_block_sizes, dbcsr_distribution, dbcsr_get_data_type, &
      dbcsr_has_symmetry, dbcsr_max_col_size, dbcsr_max_row_size, dbcsr_nblkcols_local, &
      dbcsr_nblkcols_total, dbcsr_nblkrows_local, dbcsr_nblkrows_total, dbcsr_nfullcols_local, &
      dbcsr_nfullcols_total, dbcsr_nfullrows_local, dbcsr_nfullrows_total, dbcsr_release, &
      dbcsr_row_block_offsets, dbcsr_row_block_sizes, dbcsr_valid_index
   USE dbcsr_mm_common, ONLY: &
      acc_transpose_blocks, calculate_norms, count_mpi_statistics, dbcsr_mm_multrec_type_p, &
      dbcsr_mpi_statistics, enumerate_blk_sizes, local_filter, max_memory, memtype_abpanel_1, &
      memtype_abpanel_2, memtype_mpi_buffer, memtype_mpi_product, memtype_product_wm, &
      memtype_trsbuffer_1, memtype_trsbuffer_2, product_matrix_size_guess, rec_sort_index, &
      setup_buffer_matrix
   USE dbcsr_mm_dist_operations, ONLY: dbcsr_reset_locals, &
                                       dbcsr_reset_vlocals, &
                                       image_calculator
   USE dbcsr_mm_multrec, ONLY: dbcsr_mm_multrec_dev2host_init, &
                               dbcsr_mm_multrec_finalize, &
                               dbcsr_mm_multrec_get_nblks, &
                               dbcsr_mm_multrec_get_nze, &
                               dbcsr_mm_multrec_init, &
                               dbcsr_mm_multrec_multiply, &
                               dbcsr_mm_multrec_red3D
   USE dbcsr_mp_methods, ONLY: &
      dbcsr_mp_grid_setup, dbcsr_mp_group, dbcsr_mp_has_subgroups, dbcsr_mp_my_col_group, &
      dbcsr_mp_my_row_group, dbcsr_mp_mynode, dbcsr_mp_mypcol, dbcsr_mp_myprow, dbcsr_mp_npcols, &
      dbcsr_mp_nprows, dbcsr_mp_numnodes, dbcsr_mp_pgrid
   USE dbcsr_mp_operations, ONLY: dbcsr_isendrecv_any, &
                                  dbcsr_rget_any, &
                                  dbcsr_win_create_any, &
                                  hybrid_alltoall_any, &
                                  hybrid_alltoall_i1
   USE dbcsr_mpiwrap, ONLY: &
      mp_allgather, mp_alltoall, mp_comm_free, mp_comm_null, mp_comm_split_direct, &
      mp_iallgather, mp_isendrecv, mp_isum, mp_request_null, mp_rget, mp_wait, mp_waitall, &
      mp_win_create, mp_win_free, mp_win_lock_all, mp_win_unlock_all, mp_environ, &
      mp_comm_type, mp_request_type, mp_win_type
   USE dbcsr_ptr_util, ONLY: ensure_array_size, &
                             memory_deallocate
   USE dbcsr_types, ONLY: &
      dbcsr_2d_array_obj, dbcsr_data_obj, dbcsr_distribution_obj, dbcsr_imagedistribution_obj, &
      dbcsr_iterator, dbcsr_memtype_type, dbcsr_mp_obj, dbcsr_num_slots, dbcsr_scalar_type, &
      dbcsr_slot_blk_p, dbcsr_slot_col_i, dbcsr_slot_coo_l, dbcsr_slot_dense, &
      dbcsr_slot_home_pcol, dbcsr_slot_home_prow, dbcsr_slot_home_vpcol, dbcsr_slot_home_vprow, &
      dbcsr_slot_nblkrows_total, dbcsr_slot_nblks, dbcsr_slot_nfullcols_local, dbcsr_slot_nze, &
      dbcsr_slot_row_p, dbcsr_slot_size, dbcsr_slot_thr_c, dbcsr_slot_type, dbcsr_type, &
      dbcsr_type_complex_4, dbcsr_type_complex_8, dbcsr_type_int_4, dbcsr_type_no_symmetry, &
      dbcsr_type_real_4, dbcsr_type_real_8
   USE dbcsr_work_operations, ONLY: dbcsr_add_wm_from_matrix, &
                                    dbcsr_create, &
                                    dbcsr_finalize, &
                                    dbcsr_work_create, &
                                    dbcsr_work_destroy
#include "base/dbcsr_base_uses.f90"

!$ USE OMP_LIB, ONLY: omp_get_max_threads, omp_get_thread_num, omp_get_num_threads, &
!$                    omp_set_lock, omp_unset_lock, omp_init_lock, omp_lock_kind, omp_destroy_lock

   IMPLICIT NONE

   PRIVATE

   CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'dbcsr_mm_3d'
   LOGICAL, PARAMETER :: debug_mod = .FALSE.
   LOGICAL, PARAMETER :: careful_mod = .FALSE.

   TYPE dbcsr_buffer
      TYPE(dbcsr_data_obj)                        :: DATA = dbcsr_data_obj(), &
                                                     data_before_resize = dbcsr_data_obj(), &
                                                     trs_stackbuf = dbcsr_data_obj()
      INTEGER                                     :: vprow = -1, vpcol = -1
      INTEGER                                     :: myproc = -1
      TYPE(mp_comm_type)                          :: grp = mp_comm_null, & ! Global communicator
                                                     subgrp = mp_comm_null ! Communicator for A and B
      TYPE(mp_win_type)                           :: data_win = mp_win_type(), meta_win = mp_win_type()
      INTEGER, DIMENSION(:), POINTER, CONTIGUOUS  :: meta => Null(), &
                                                     meta_before_resize => Null(), &
                                                     meta_red3D => Null()
      TYPE(mp_request_type), DIMENSION(2)         :: get_requests = mp_request_null
      INTEGER                                     :: meta_size = -1
      INTEGER                                     :: num_layers_3D = 1
      INTEGER                                     :: coord3D = 1
      TYPE(dbcsr_type)                            :: matrix = dbcsr_type()
      LOGICAL                                     :: is_valid = .FALSE., &
                                                     is_comm = .FALSE., &
                                                     has_rma_win = .FALSE.
   END TYPE dbcsr_buffer

   TYPE dbcsr_buffers
      TYPE(dbcsr_buffer)                 :: left = dbcsr_buffer(), right = dbcsr_buffer()
   END TYPE dbcsr_buffers

   TYPE mn_local_sizes
      INTEGER, DIMENSION(:), POINTER, CONTIGUOUS :: sizes => Null()
   END TYPE mn_local_sizes

   TYPE dbcsr_layers_3D_C_reduction
      TYPE(mp_comm_type)                 :: grp = mp_comm_null, &
                                            grp3D = mp_comm_null, &
                                            rowgrp3D = mp_comm_null
      INTEGER                            :: num_layers_3D = 1, &
                                            max_num_layers_3D = 1, &
                                            side3D = HUGE(1)
      ! Use a buffer per each thread
      TYPE(dbcsr_data_obj), &
         DIMENSION(:), ALLOCATABLE       :: data_red3D
      INTEGER                            :: data_type = -1
   END TYPE dbcsr_layers_3D_C_reduction

   ! Buffers
   TYPE(dbcsr_buffers), TARGET, SAVE :: buffers_win, &
                                        buffers_1, buffers_2

   INTEGER, PARAMETER, PRIVATE               :: idata = 1, &
                                                imeta = 2, &
                                                ilocal_proc = 1

   TYPE(dbcsr_layers_3D_C_reduction), SAVE :: layers_3D_C_reduction

   LOGICAL, DIMENSION(2), TARGET, SAVE, PRIVATE       :: do_win_create_left, &
                                                         do_win_create_right

   INTEGER, ALLOCATABLE, DIMENSION(:), PRIVATE :: left_total_row_counts

   INTEGER, ALLOCATABLE, DIMENSION(:, :), TARGET, PRIVATE  :: left_local_images_size, &
                                                              right_local_images_size
   INTEGER, DIMENSION(:, :, :, :), POINTER, CONTIGUOUS, PRIVATE :: left_images_size => Null(), &
                                                                   right_images_size => Null()

   INTEGER, ALLOCATABLE, DIMENSION(:), TARGET, PRIVATE :: g2l_map_cols, g2l_map_rows

   TYPE(mp_request_type), PRIVATE               :: request_count_rows
   TYPE(mp_request_type), DIMENSION(2), PRIVATE :: requests
   TYPE(mp_request_type), DIMENSION(2), PRIVATE :: requests_win_create

   TYPE(mp_request_type) :: request_sync_mult = mp_request_null

   ! Buffers used in make_buffers
   TYPE(dbcsr_data_obj), TARGET, SAVE :: make_buffers_data_recv, make_buffers_data_send
   INTEGER, DIMENSION(:), POINTER, CONTIGUOUS :: make_buffers_meta_recv => Null(), &
                                                 make_buffers_meta_send => Null()

   PUBLIC :: multiply_3D
   PUBLIC :: release_layers_3d_C_reduction, buffers_release
   PUBLIC :: dbcsr_make_buffers, make_layers_3d_C_reduction
   PUBLIC :: request_sync_mult
   PUBLIC :: get_max_layers_3D

CONTAINS

   SUBROUTINE dbcsr_make_buffers(matrix, imgdist, is_left, &
      !! Prepare orig images for MPI windows
                                 f_row, l_row, f_col, l_col, &
                                 otf_filtering, alpha)
      TYPE(dbcsr_type), INTENT(IN)                       :: matrix
      TYPE(dbcsr_imagedistribution_obj), INTENT(IN)      :: imgdist
      LOGICAL, INTENT(IN)                                :: is_left
      INTEGER, INTENT(IN)                                :: f_row, l_row, f_col, l_col
      LOGICAL, INTENT(IN)                                :: otf_filtering
      TYPE(dbcsr_scalar_type), INTENT(IN), OPTIONAL      :: alpha

      LOGICAL                                            :: do_scale

      do_scale = .FALSE.
      IF (PRESENT(alpha)) THEN
         IF (.NOT. dbcsr_scalar_are_equal(alpha, dbcsr_scalar_one(alpha%data_type))) THEN
            do_scale = .TRUE.
         END IF
      END IF
      !
      IF (do_scale) THEN
         CALL make_buffers(matrix, imgdist, is_left, &
                           f_row, l_row, f_col, l_col, &
                           otf_filtering, alpha)
      ELSE
         CALL make_buffers(matrix, imgdist, is_left, &
                           f_row, l_row, f_col, l_col, &
                           otf_filtering)
      END IF
   END SUBROUTINE dbcsr_make_buffers

   SUBROUTINE make_buffers(matrix, imgdist, is_left, &
      !! Prepare orig images for MPI windows
                           f_row, l_row, f_col, l_col, &
                           otf_filtering, scale_value)
      TYPE(dbcsr_type), INTENT(IN)                       :: matrix
      TYPE(dbcsr_imagedistribution_obj), INTENT(IN)      :: imgdist
      LOGICAL, INTENT(IN)                                :: is_left
      INTEGER, INTENT(IN)                                :: f_row, l_row, f_col, l_col
      LOGICAL, INTENT(IN)                                :: otf_filtering
      TYPE(dbcsr_scalar_type), INTENT(IN), OPTIONAL      :: scale_value

      CHARACTER(len=*), PARAMETER :: routineN = 'make_buffers'
      INTEGER :: blk, blk_p, bp, col, col_img, col_size, data_type, dst_proc, f_col_f, f_row_f, &
                 handle, handle2, irequests, it, ithread, l_col_l, l_row_l, mynode, myt, &
                 nblkcols_local, nblkrows_local, ncols_images, nimages, nprocs, nprocs_total, &
                 nrows_images, nsymmetries, nthreads, nze, pcol, prow, row, row_img, row_size, size_index, &
                 stored_col, stored_row, symmetry_i, tr_col_size, tr_row_size
      INTEGER, ALLOCATABLE, DIMENSION(:)                 :: img_nblks_cols, img_nblks_rows
      INTEGER, ALLOCATABLE, DIMENSION(:, :)              :: local_images_displ, recv_displ_proc, &
                                                            recv_size_proc, send_displ_proc, &
                                                            send_size_proc
      INTEGER, ALLOCATABLE, DIMENSION(:, :, :)           :: offset_data, offset_threads
      INTEGER, ALLOCATABLE, DIMENSION(:, :, :, :)        :: recv_displs, recv_sizes, send_displs, &
                                                            send_sizes
      INTEGER, DIMENSION(2)                              :: block_col_bounds, block_row_bounds
      INTEGER, DIMENSION(:), POINTER, CONTIGUOUS  :: col_dist, col_img_dist, local_cols, local_g2l_map_cols, &
                                                     local_g2l_map_rows, local_rows, meta_buffer_p, &
                                                     row_dist, row_img_dist, threads_dist
      INTEGER, DIMENSION(:, :), POINTER, CONTIGUOUS      :: blacs2mpi, local_images_size
      INTEGER, DIMENSION(idata:imeta)                    :: my_size_recv, my_size_send
      INTEGER, POINTER                                   :: coli, rowi
      INTEGER, TARGET                                    :: mi, ui
      LOGICAL :: do_crop, do_part_crop_col, do_part_crop_f_col, do_part_crop_f_row, &
                 do_part_crop_l_col, do_part_crop_l_row, do_part_crop_row, do_symmetry, tr
      LOGICAL, DIMENSION(:), POINTER, CONTIGUOUS         :: do_win_create
      TYPE(dbcsr_buffer), POINTER                        :: buffer
      TYPE(dbcsr_data_obj)                               :: data_block
      TYPE(dbcsr_data_obj), POINTER                      :: data_buffer_p
      TYPE(dbcsr_distribution_obj)                       :: set_dist
      TYPE(dbcsr_iterator)                               :: iter
      TYPE(dbcsr_mp_obj)                                 :: mp_obj
      TYPE(dbcsr_scalar_type)                            :: scale_neg_one
      TYPE(dbcsr_type)                                   :: sm
      TYPE(mp_comm_type)                                 :: grp

!$    INTEGER(kind=omp_lock_kind), ALLOCATABLE, DIMENSION(:) :: locks

      CALL timeset(routineN, handle)
      ! Sync with previous multiplication
      IF (request_sync_mult .NE. mp_request_null) &
         DBCSR_ABORT("Multiplications are not in sync!")
      !
      ! Take input values and check validity
      IF (.NOT. dbcsr_valid_index(matrix)) &
         DBCSR_ABORT("Matrix not initialized.")
      sm = matrix
      data_type = sm%data_type
      IF (data_type .NE. dbcsr_type_real_8 .AND. &
          data_type .NE. dbcsr_type_real_4 .AND. &
          data_type .NE. dbcsr_type_complex_8 .AND. &
          data_type .NE. dbcsr_type_complex_4) &
         DBCSR_ABORT("Invalid data type.")
      scale_neg_one = dbcsr_scalar_negative(dbcsr_scalar_one(data_type))
      set_dist = imgdist%i%main
      row_dist => dbcsr_distribution_row_dist(set_dist)
      col_dist => dbcsr_distribution_col_dist(set_dist)
      local_rows => dbcsr_distribution_local_rows(set_dist)
      local_cols => dbcsr_distribution_local_cols(set_dist)
      nblkrows_local = SIZE(local_rows)
      nblkcols_local = SIZE(local_cols)
      IF (sm%symmetry) THEN
         IF (SIZE(row_dist) .NE. SIZE(col_dist)) &
            DBCSR_WARN('Unequal row and column distributions for symmetric matrix.')
      END IF
      nrows_images = imgdist%i%row_decimation
      row_img_dist => array_data(imgdist%i%row_image)
      ncols_images = imgdist%i%col_decimation
      col_img_dist => array_data(imgdist%i%col_image)
      mp_obj = dbcsr_distribution_mp(imgdist%i%main)
      CALL dbcsr_mp_grid_setup(mp_obj)
      nprocs_total = dbcsr_mp_numnodes(mp_obj)
      mynode = dbcsr_mp_mynode(mp_obj)
      grp = dbcsr_mp_group(mp_obj)
      blacs2mpi => dbcsr_mp_pgrid(mp_obj)
      IF (dbcsr_distribution_max_row_dist(set_dist) .GT. UBOUND(blacs2mpi, 1)) &
         DBCSR_ABORT("Row distribution references unexistent processor rows")
      IF (dbcsr_distribution_max_col_dist(set_dist) .GT. UBOUND(blacs2mpi, 2)) &
         DBCSR_ABORT("Col distribution references unexistent processor cols")
      ! Check threads configuration
      NULLIFY (threads_dist)
!$    IF (.NOT. dbcsr_distribution_has_threads(dbcsr_distribution(matrix))) &
!$       DBCSR_ABORT("Thread distribution not defined")
!$    threads_dist => array_data(dbcsr_distribution_thread_dist(dbcsr_distribution(matrix)))
      IF (is_left) THEN
         IF (nrows_images .GT. 1) &
            DBCSR_ABORT("Row nimages for left matrix is not 1!")
      ELSE
         IF (ncols_images .GT. 1) &
            DBCSR_ABORT("Col nimages for right matrix is not 1!")
      END IF
      !
      ! Crop matrix
      do_crop = .FALSE.
      do_part_crop_row = .FALSE.
      do_part_crop_col = .FALSE.
      do_part_crop_f_row = .FALSE.
      do_part_crop_l_row = .FALSE.
      do_part_crop_f_col = .FALSE.
      do_part_crop_l_col = .FALSE.
      ! Set no limits
      IF (ANY((/f_row, l_row, f_col, l_col/) .NE. 0)) THEN
         IF (f_row .LT. 0) &
            DBCSR_ABORT("Invalid first row bound.")
         IF (l_row .GT. dbcsr_nfullrows_total(matrix)) &
            DBCSR_ABORT("Invalid last row bound.")
         IF (f_col .LT. 0) &
            DBCSR_ABORT("Invalid first column bound.")
         IF (l_col .GT. dbcsr_nfullcols_total(matrix)) &
            DBCSR_ABORT("Invalid last column bound.")
         !
         do_crop = .TRUE.
         !
         ! Convert bounds to block addressing
         IF (f_row .EQ. 0) THEN
            block_row_bounds(1) = 1
         ELSE
            CALL find_block_of_element(f_row, block_row_bounds(1), &
                                       dbcsr_nblkrows_total(matrix), &
                                       dbcsr_row_block_offsets(matrix), &
                                       hint=0)
            do_part_crop_f_row = array_get(dbcsr_row_block_offsets(matrix), block_row_bounds(1)) .NE. f_row
            IF (do_part_crop_f_row) THEN
               ! Block offset of last cleared row
               f_row_f = f_row - array_get(dbcsr_row_block_offsets(matrix), block_row_bounds(1))
            END IF
         END IF
         !
         IF (l_row .EQ. 0) THEN
            block_row_bounds(2) = dbcsr_nblkrows_total(matrix)
         ELSE
            CALL find_block_of_element(l_row, block_row_bounds(2), &
                                       dbcsr_nblkrows_total(matrix), &
                                       dbcsr_row_block_offsets(matrix), &
                                       hint=0)
            do_part_crop_l_row = (array_get(dbcsr_row_block_offsets(matrix), block_row_bounds(2) + 1) - 1) .NE. l_row
            IF (do_part_crop_l_row) THEN
               ! Block offset of first cleared row
               l_row_l = 2 + l_row - array_get(dbcsr_row_block_offsets(matrix), block_row_bounds(2))
            END IF
         END IF
         do_part_crop_row = do_part_crop_f_row .OR. do_part_crop_l_row
         !
         IF (f_col .EQ. 0) THEN
            block_col_bounds(1) = 1
         ELSE
            CALL find_block_of_element(f_col, block_col_bounds(1), &
                                       dbcsr_nblkcols_total(matrix), &
                                       dbcsr_col_block_offsets(matrix), &
                                       hint=0)
            do_part_crop_f_col = array_get(dbcsr_col_block_offsets(matrix), block_col_bounds(1)) .NE. f_col
            IF (do_part_crop_f_col) THEN
               ! Block offset of last cleared col
               f_col_f = f_col - array_get(dbcsr_col_block_offsets(matrix), block_col_bounds(1))
            END IF
         END IF
         !
         IF (l_col .EQ. 0) THEN
            block_col_bounds(2) = dbcsr_nblkcols_total(matrix)
         ELSE
            CALL find_block_of_element(l_col, block_col_bounds(2), &
                                       dbcsr_nblkcols_total(matrix), &
                                       dbcsr_col_block_offsets(matrix), &
                                       hint=0)
            do_part_crop_l_col = (array_get(dbcsr_col_block_offsets(matrix), block_col_bounds(2) + 1) - 1) .NE. l_col
            IF (do_part_crop_l_col) THEN
               ! Block offset of first cleared col
               l_col_l = 2 + l_col - array_get(dbcsr_col_block_offsets(matrix), block_col_bounds(2))
            END IF
         END IF
         do_part_crop_col = do_part_crop_f_col .OR. do_part_crop_l_col
      END IF
      !
      IF (dbcsr_has_symmetry(matrix)) THEN
         nsymmetries = 2
         do_symmetry = .TRUE.
      ELSE
         nsymmetries = 1
         do_symmetry = .FALSE.
      END IF
      !
      IF (is_left) THEN
         nimages = ncols_images
         buffer => buffers_win%left
         nprocs = dbcsr_mp_npcols(mp_obj)
         ALLOCATE (left_images_size(idata:imeta, &
                                    nimages, &
                                    MAX(1, dbcsr_mp_nprows(mp_obj)/layers_3D_C_reduction%side3D), &
                                    0:nprocs - 1))
         ALLOCATE (left_local_images_size(idata:imeta, nimages))
         local_images_size => left_local_images_size
         irequests = 1
         !
         ! Count the maximum possible multiplies per row for on-the-fly filtering
         IF (otf_filtering) THEN
            ALLOCATE (left_total_row_counts(nblkrows_local))
            left_total_row_counts = 0
         END IF
         do_win_create => do_win_create_left
      ELSE
         nimages = nrows_images
         buffer => buffers_win%right
         nprocs = dbcsr_mp_nprows(mp_obj)
         ALLOCATE (right_images_size(idata:imeta, &
                                     nimages, &
                                     MAX(1, dbcsr_mp_npcols(mp_obj)/layers_3D_C_reduction%side3D), &
                                     0:nprocs - 1))
         ALLOCATE (right_local_images_size(idata:imeta, nimages))
         local_images_size => right_local_images_size
         irequests = 2
         do_win_create => do_win_create_right
      END IF
      !
      ! 3D communicator
      CALL make_layers_3D_AB(layers_3D_C_reduction%num_layers_3D, &
                             layers_3D_C_reduction%side3D, &
                             mp_obj, is_left, buffer)
      !
      ! Evaluate maps for global -> local indexing (g2l_map_rows, g2l_map_cols)
      ! Count the number of blocks per row/column (img_nblks_rows, img_nblks_cols)
      IF (is_left) THEN
         ALLOCATE (g2l_map_rows(sm%nblkrows_total))
         local_g2l_map_rows => g2l_map_rows
         ALLOCATE (local_g2l_map_cols(sm%nblkcols_total))
         ALLOCATE (img_nblks_rows(1), img_nblks_cols(nimages))
      ELSE
         ALLOCATE (g2l_map_cols(sm%nblkcols_total))
         local_g2l_map_cols => g2l_map_cols
         ALLOCATE (local_g2l_map_rows(sm%nblkrows_total))
         ALLOCATE (img_nblks_rows(nimages), img_nblks_cols(1))
      END IF
      !
      local_g2l_map_rows(:) = 0
      IF (nrows_images .EQ. 1) THEN
         img_nblks_rows(1) = nblkrows_local
         DO row = 1, nblkrows_local
            local_g2l_map_rows(local_rows(row)) = row
         END DO
      ELSE
         img_nblks_rows(:) = 0
         DO row = 1, nblkrows_local
            row_img = row_img_dist(local_rows(row))
            ui = MOD(row_img - 1, nrows_images) + 1
            img_nblks_rows(ui) = img_nblks_rows(ui) + 1
            local_g2l_map_rows(local_rows(row)) = img_nblks_rows(ui)
         END DO
      END IF
      !
      local_g2l_map_cols(:) = 0
      IF (ncols_images .EQ. 1) THEN
         img_nblks_cols(1) = nblkcols_local
         DO col = 1, nblkcols_local
            local_g2l_map_cols(local_cols(col)) = col
         END DO
      ELSE
         img_nblks_cols(:) = 0
         DO col = 1, nblkcols_local
            col_img = col_img_dist(local_cols(col))
            ui = MOD(col_img - 1, ncols_images) + 1
            img_nblks_cols(ui) = img_nblks_cols(ui) + 1
            local_g2l_map_cols(local_cols(col)) = img_nblks_cols(ui)
         END DO
      END IF
      !
!$OMP PARALLEL DEFAULT (NONE) &
!$OMP PRIVATE (ithread,myt,iter,row,col,blk,row_size,col_size,&
!$OMP          stored_row,stored_col,blk_p,bp,tr,&
!$OMP          nze,symmetry_i,row_img,col_img,rowi,coli,&
!$OMP          tr_row_size,tr_col_size,prow,pcol,dst_proc,&
!$OMP          data_buffer_p,meta_buffer_p,&
!$OMP          mi,ui,it,data_block) &
!$OMP SHARED (nthreads,send_sizes,offset_data,matrix,nsymmetries,do_symmetry,&
!$OMP         row_img_dist,col_img_dist,imgdist,row_dist,col_dist,&
!$OMP         is_left,my_size_send,my_size_recv,nimages,&
!$OMP         local_images_size,data_type,memtype_mpi_buffer,sm,&
!$OMP         img_nblks_cols,img_nblks_rows,mynode,offset_threads,&
!$OMP         local_g2l_map_cols,local_g2l_map_rows,recv_sizes,grp,make_buffers_meta_send,&
!$OMP         scale_value,scale_neg_one,make_buffers_data_send,make_buffers_data_recv,&
!$OMP         size_index,recv_displ_proc,recv_size_proc,send_size_proc,&
!$OMP         mp_obj,threads_dist,make_buffers_meta_recv,nrows_images,ncols_images,&
!$OMP         locks,blacs2mpi,send_displ_proc,recv_displs,send_displs,&
!$OMP         left_images_size,right_images_size,local_images_displ,requests,&
!$OMP         buffer,left_total_row_counts,otf_filtering,&
!$OMP         irequests,do_win_create,handle2,nprocs_total,&
!$OMP         do_crop,do_part_crop_row,do_part_crop_col,block_row_bounds,block_col_bounds,&
!$OMP         do_part_crop_f_row,do_part_crop_l_row,do_part_crop_f_col,do_part_crop_l_col,&
!$OMP         f_row_f,l_row_l,f_col_f,l_col_l,requests_win_create)
      ithread = 0
!$    ithread = omp_get_thread_num()
      myt = ithread
      IF (is_left) THEN
         rowi => mi
         coli => ui
      ELSE
         rowi => ui
         coli => mi
      END IF
!$OMP MASTER
      nthreads = 1
!$    nthreads = omp_get_num_threads()
      ALLOCATE (send_sizes(idata:imeta, 0:nthreads - 1, &
                           nimages, 0:nprocs_total - 1))
      send_sizes(:, :, :, :) = 0
      !
      size_index = 0
!$    IF (is_left) THEN
!$       size_index = nthreads + 1
!$    END IF
!$    IF (is_left .AND. do_symmetry) THEN
!$       ALLOCATE (locks(0:nthreads - 1))
!$    END IF
!$OMP END MASTER
!$OMP BARRIER
!$    IF (is_left .AND. do_symmetry) THEN
!$       call omp_init_lock(locks(ithread))
!$    END IF
      !
      ! Take data and meta dimensions per each thread, image, proc
      CALL dbcsr_iterator_start(iter, matrix, shared=.TRUE.)
      DO WHILE (dbcsr_iterator_blocks_left(iter))
         CALL dbcsr_iterator_next_block(iter, row, col, blk, &
                                        row_size=row_size, col_size=col_size)
         nze = row_size*col_size
         IF (nze .EQ. 0) CYCLE
         DO symmetry_i = 1, nsymmetries
            IF (symmetry_i .EQ. 1) THEN
               stored_row = row; stored_col = col
            ELSE
               IF (row .EQ. col) CYCLE
               stored_row = col; stored_col = row
            END IF
            ! Apply cropping
            IF (do_crop) THEN
               IF (stored_row .LT. block_row_bounds(1)) CYCLE
               IF (stored_row .GT. block_row_bounds(2)) CYCLE
               IF (stored_col .LT. block_col_bounds(1)) CYCLE
               IF (stored_col .GT. block_col_bounds(2)) CYCLE
            END IF
            row_img = row_img_dist(stored_row)
            col_img = col_img_dist(stored_col)
            CALL image_calculator(imgdist, &
                                  prow=prow, pcol=pcol, &
                                  rowi=rowi, coli=coli, &
                                  myprow=row_dist(stored_row), myrowi=row_img, &
                                  mypcol=col_dist(stored_col), mycoli=col_img, &
                                  shifting='0')
            dst_proc = blacs2mpi(prow, pcol)
!$          IF (is_left .AND. do_symmetry) THEN
!$             myt = threads_dist(stored_row)
!$          END IF
!$OMP ATOMIC
            send_sizes(imeta, myt, ui, dst_proc) = &
               send_sizes(imeta, myt, ui, dst_proc) + 3
!$OMP ATOMIC
            send_sizes(idata, myt, ui, dst_proc) = &
               send_sizes(idata, myt, ui, dst_proc) + nze
         END DO ! symmetry_i
      END DO
      CALL dbcsr_iterator_stop(iter)
!$OMP BARRIER
!$OMP MASTER
      ! Exchange refs
      ALLOCATE (recv_sizes(idata:imeta, 0:nthreads - 1, &
                           nimages, 0:nprocs_total - 1))
      CALL timeset(routineN//"_sizes", handle2)
      CALL mp_alltoall(send_sizes(:, :, :, :), &
                       recv_sizes(:, :, :, :), &
                       2*nimages*nthreads, grp)
      CALL timestop(handle2)
      !
      ! Evaluate the local size for each image, accumulating over threads and procs.
      ! Take the local displacement for each image.
      ! Note that displacement starts at zero.
      my_size_recv(:) = 0
      local_images_size(:, :) = 0
      ALLOCATE (local_images_displ(idata:imeta, nimages))
      DO ui = 1, nimages
         local_images_displ(:, ui) = my_size_recv(:)
         DO dst_proc = 0, nprocs_total - 1
            DO it = 0, nthreads - 1
               local_images_size(:, ui) = local_images_size(:, ui) + &
                                          recv_sizes(:, it, ui, dst_proc)
            END DO
         END DO
         IF (local_images_size(imeta, ui) .EQ. 0) CYCLE
         ! Include stats slots for threads indices
         local_images_size(imeta, ui) = local_images_size(imeta, ui) + size_index
         my_size_recv(:) = my_size_recv(:) + local_images_size(:, ui)
      END DO
      !
      ! Exchange sizes
      IF (is_left) THEN
         CALL mp_iallgather(local_images_size, left_images_size, buffer%subgrp, requests(irequests))
      ELSE
         CALL mp_iallgather(local_images_size, right_images_size, buffer%subgrp, requests(irequests))
      END IF
      !
      ! Allocate data and meta buffers
      do_win_create(:) = .NOT. buffer%has_rma_win
      IF (buffer%has_rma_win) THEN
         IF (buffer%grp .NE. grp .OR. dbcsr_data_get_type(buffer%data) .NE. data_type) THEN
            do_win_create(:) = .TRUE.
         END IF
      END IF
      CALL buffer_init(buffer, data_type, &
                       my_size_recv(idata), my_size_recv(imeta), &
                       data_memory_type=memtype_mpi_buffer)
      buffer%grp = grp
      !
      ! Set send and recv buffers sizes and displacements for each proc.
      ! Accumulate over images and threads.
      ! Here displacement starts at one.
      ALLOCATE (send_displs(idata:imeta, 0:nthreads - 1, &
                            nimages, 0:nprocs_total - 1))
      ! Displs for local data arrangement, starting at one.
      ALLOCATE (recv_displs(idata:imeta, 0:nthreads - 1, &
                            nimages, 0:nprocs_total - 1))
      ! Here displacement starts at zero.
      ALLOCATE (send_size_proc(idata:imeta, 0:nprocs_total - 1))
      ALLOCATE (recv_size_proc(idata:imeta, 0:nprocs_total - 1))
      ALLOCATE (send_displ_proc(idata:imeta, 0:nprocs_total - 1))
      ALLOCATE (recv_displ_proc(idata:imeta, 0:nprocs_total - 1))
      my_size_send(:) = 1
      my_size_recv(:) = 1
      DO dst_proc = 0, nprocs_total - 1
         send_displ_proc(:, dst_proc) = my_size_send(:) - 1
         recv_displ_proc(:, dst_proc) = my_size_recv(:) - 1
         ! Avoid communication of local data
         IF (dst_proc .NE. mynode) THEN
            DO ui = 1, nimages
               DO it = 0, nthreads - 1
                  send_displs(:, it, ui, dst_proc) = my_size_send(:)
                  recv_displs(:, it, ui, dst_proc) = my_size_recv(:)
                  my_size_send(:) = my_size_send(:) + send_sizes(:, it, ui, dst_proc)
                  my_size_recv(:) = my_size_recv(:) + recv_sizes(:, it, ui, dst_proc)
               END DO
            END DO
         ELSE
            ! Reset all
            send_displs(:, :, :, dst_proc) = 0
            recv_displs(:, :, :, dst_proc) = 0
         END IF
         send_size_proc(:, dst_proc) = my_size_send(:) - send_displ_proc(:, dst_proc) - 1
         recv_size_proc(:, dst_proc) = my_size_recv(:) - recv_displ_proc(:, dst_proc) - 1
      END DO
      !
      ! Allocate data/meta to send
      IF (dbcsr_data_valid(make_buffers_data_send)) THEN
         IF (dbcsr_data_get_type(make_buffers_data_send) .NE. data_type) THEN
            CALL dbcsr_data_release(make_buffers_data_send)
         END IF
      END IF
      IF (dbcsr_data_valid(make_buffers_data_send)) THEN
         CALL dbcsr_data_ensure_size(make_buffers_data_send, my_size_send(idata) - 1, nocopy=.TRUE.)
      ELSE
         CALL dbcsr_data_init(make_buffers_data_send)
         CALL dbcsr_data_new(make_buffers_data_send, data_type, my_size_send(idata) - 1, &
                             memory_type=memtype_mpi_buffer)
      END IF
      CALL ensure_array_size(make_buffers_meta_send, ub=my_size_send(imeta) - 1, &
                             nocopy=.TRUE., memory_type=memtype_mpi_buffer)
      ! Displs for data offset
      ALLOCATE (offset_threads(idata:imeta, 0:nthreads - 1, nimages))
      offset_threads(:, :, :) = 0
      ! Set offset for local data
      ALLOCATE (offset_data(0:nthreads - 1, nimages, 0:nprocs_total - 1))
      offset_data(:, :, :) = 1
      ! Evaluate local displs
      DO ui = 1, nimages
         IF (local_images_size(imeta, ui) .EQ. 0) CYCLE
         offset_threads(:, 0, ui) = 0
         DO it = 1, nthreads - 1
            offset_threads(:, it, ui) = offset_threads(:, it - 1, ui)
            DO dst_proc = 0, nprocs_total - 1
               offset_threads(:, it, ui) = offset_threads(:, it, ui) + &
                                           recv_sizes(:, it - 1, ui, dst_proc)
            END DO
         END DO
         ! Fill meta indices for threads
!$       IF (is_left) THEN
!$          buffer%meta(local_images_displ(imeta, ui) + 1:local_images_displ(imeta, ui) + nthreads) = &
!$             offset_threads(imeta, :, ui)/3
!$          buffer%meta(local_images_displ(imeta, ui) + size_index) = &
!$             (local_images_size(imeta, ui) - size_index)/3
!$       END IF
         offset_threads(imeta, :, ui) = offset_threads(imeta, :, ui) + local_images_displ(imeta, ui) + size_index + 1
         send_displs(:, :, ui, mynode) = offset_threads(:, :, ui)
         !
         ! Allow ordering by proc for insertion
         DO dst_proc = 0, mynode - 1
            DO it = 0, nthreads - 1
               send_displs(:, it, ui, mynode) = send_displs(:, it, ui, mynode) + &
                                                recv_sizes(:, it, ui, dst_proc)
            END DO
         END DO
         offset_data(:, ui, mynode) = send_displs(idata, :, ui, mynode) + 1
         send_displs(idata, :, ui, mynode) = send_displs(idata, :, ui, mynode) + local_images_displ(idata, ui) + 1
      END DO
!$OMP END MASTER
!$OMP BARRIER
      !
      IF (do_part_crop_row .OR. do_part_crop_col) THEN
         CALL dbcsr_data_init(data_block)
         CALL dbcsr_data_new(data_block, dbcsr_type_1d_to_2d(data_type))
      END IF
      !
      ! Copy data and meta in the buffers
      CALL timeset(routineN//"_pack", handle2)
      CALL dbcsr_iterator_start(iter, matrix, shared=.TRUE.)
      DO WHILE (dbcsr_iterator_blocks_left(iter))
         CALL dbcsr_iterator_next_block(iter, row, col, blk, blk_p=blk_p, &
                                        row_size=row_size, col_size=col_size)
         nze = row_size*col_size
         IF (nze .EQ. 0) CYCLE
         bp = ABS(blk_p)
         DO symmetry_i = 1, nsymmetries
            IF (symmetry_i .EQ. 1) THEN
               stored_row = row; stored_col = col; tr = blk_p .LT. 0
               tr_row_size = col_size; tr_col_size = row_size
            ELSE
               IF (row .EQ. col) CYCLE
               stored_row = col; stored_col = row; tr = blk_p .GT. 0
               tr_row_size = row_size; tr_col_size = col_size
            END IF
            ! Apply cropping
            IF (do_crop) THEN
               IF (stored_row .LT. block_row_bounds(1)) CYCLE
               IF (stored_row .GT. block_row_bounds(2)) CYCLE
               IF (stored_col .LT. block_col_bounds(1)) CYCLE
               IF (stored_col .GT. block_col_bounds(2)) CYCLE
            END IF
            row_img = row_img_dist(stored_row)
            col_img = col_img_dist(stored_col)
            CALL image_calculator(imgdist, &
                                  prow=prow, pcol=pcol, &
                                  rowi=rowi, coli=coli, &
                                  myprow=row_dist(stored_row), myrowi=row_img, &
                                  mypcol=col_dist(stored_col), mycoli=col_img, &
                                  shifting='0')
            dst_proc = blacs2mpi(prow, pcol)
            IF (dst_proc .EQ. mynode) THEN
               data_buffer_p => buffer%data
               meta_buffer_p => buffer%meta
            ELSE
               data_buffer_p => make_buffers_data_send
               meta_buffer_p => make_buffers_meta_send
            END IF
!$          IF (is_left .AND. do_symmetry) THEN
!$             myt = threads_dist(stored_row)
!$             call omp_set_lock(locks(myt))
!$          END IF
            IF (tr) THEN
               CALL dbcsr_block_transpose_aa(data_buffer_p, sm%data_area, tr_row_size, tr_col_size, &
                                             send_displs(idata, myt, ui, dst_proc), bp, &
                                             scale_value)
               IF (sm%negate_real .AND. sm%negate_imaginary) THEN
                  CALL dbcsr_block_scale(data_buffer_p, scale=scale_neg_one, &
                                         row_size=nze, col_size=1, &
                                         lb=send_displs(idata, myt, ui, dst_proc))
               ELSEIF (sm%negate_real) THEN
                  CALL dbcsr_block_real_neg(data_buffer_p, row_size=nze, col_size=1, &
                                            lb=send_displs(idata, myt, ui, dst_proc))
               ELSEIF (sm%negate_imaginary) THEN
                  CALL dbcsr_block_conjg(data_buffer_p, row_size=nze, col_size=1, &
                                         lb=send_displs(idata, myt, ui, dst_proc))
               END IF
            ELSE
               CALL dbcsr_block_copy_aa(data_buffer_p, sm%data_area, row_size, col_size, &
                                        send_displs(idata, myt, ui, dst_proc), bp, &
                                        scale_value)
            END IF
            !
            ! Apply cropping for partial blocks
            IF (do_part_crop_row .OR. do_part_crop_col) THEN
               CALL dbcsr_data_set_pointer( &
                  area=data_block, &
                  rsize=row_size, &
                  csize=col_size, &
                  pointee=data_buffer_p, &
                  source_lb=send_displs(idata, myt, ui, dst_proc))
               IF (do_part_crop_row) THEN
                  IF (do_part_crop_f_row .AND. stored_row .EQ. block_row_bounds(1)) THEN
                     CALL dbcsr_data_clear(data_block, ub=f_row_f)
                  END IF
                  IF (do_part_crop_l_row .AND. stored_row .EQ. block_row_bounds(2)) THEN
                     CALL dbcsr_data_clear(data_block, lb=l_row_l)
                  END IF
               END IF
               IF (do_part_crop_col) THEN
                  IF (do_part_crop_f_col .AND. stored_col .EQ. block_col_bounds(1)) THEN
                     CALL dbcsr_data_clear(data_block, ub2=f_col_f)
                  END IF
                  IF (do_part_crop_l_col .AND. stored_col .EQ. block_col_bounds(2)) THEN
                     CALL dbcsr_data_clear(data_block, lb2=l_col_l)
                  END IF
               END IF
            END IF
            !
            ! Set meta data (global or local indexing)
            IF (dst_proc .EQ. mynode) THEN
               stored_row = local_g2l_map_rows(stored_row)
               stored_col = local_g2l_map_cols(stored_col)
               ! Count the maximum possible multiplies per row for on-the-fly filtering
               IF (is_left .AND. otf_filtering) THEN
                  left_total_row_counts(stored_row) = &
                     left_total_row_counts(stored_row) + 1
               END IF
            END IF
            meta_buffer_p(send_displs(imeta, myt, ui, dst_proc)) = stored_row
            meta_buffer_p(send_displs(imeta, myt, ui, dst_proc) + 1) = stored_col
            meta_buffer_p(send_displs(imeta, myt, ui, dst_proc) + 2) = offset_data(myt, ui, dst_proc)
            !
            send_displs(imeta, myt, ui, dst_proc) = send_displs(imeta, myt, ui, dst_proc) + 3
            send_displs(idata, myt, ui, dst_proc) = send_displs(idata, myt, ui, dst_proc) + nze
            offset_data(myt, ui, dst_proc) = offset_data(myt, ui, dst_proc) + nze
!$          IF (is_left .AND. do_symmetry) THEN
!$             call omp_unset_lock(locks(myt))
!$          END IF
         END DO
      END DO
      CALL dbcsr_iterator_stop(iter)
      CALL timestop(handle2)
      !
      IF (do_part_crop_row .OR. do_part_crop_col) THEN
         CALL dbcsr_data_clear_pointer(data_block)
         CALL dbcsr_data_release(data_block)
      END IF
      !
!$OMP BARRIER
!$OMP MASTER
      !
      ! Allocate data/meta to recv
      IF (dbcsr_data_valid(make_buffers_data_recv)) THEN
         IF (dbcsr_data_get_type(make_buffers_data_recv) .NE. data_type) THEN
            CALL dbcsr_data_release(make_buffers_data_recv)
         END IF
      END IF
      IF (dbcsr_data_valid(make_buffers_data_recv)) THEN
         CALL dbcsr_data_ensure_size(make_buffers_data_recv, my_size_recv(idata) - 1, nocopy=.TRUE.)
      ELSE
         CALL dbcsr_data_init(make_buffers_data_recv)
         CALL dbcsr_data_new(make_buffers_data_recv, data_type, my_size_recv(idata) - 1, &
                             memory_type=memtype_mpi_buffer)
      END IF
      CALL ensure_array_size(make_buffers_meta_recv, ub=my_size_recv(imeta) - 1, &
                             nocopy=.TRUE., memory_type=memtype_mpi_buffer)
      ! Exchange data
      CALL timeset(routineN//"_data", handle2)
      CALL hybrid_alltoall_any(make_buffers_data_send, send_size_proc(idata, :), send_displ_proc(idata, :), &
                               make_buffers_data_recv, recv_size_proc(idata, :), recv_displ_proc(idata, :), &
                               mp_obj, &
                               most_ptp=.TRUE., remainder_ptp=.TRUE., no_hybrid=.FALSE.)
      CALL hybrid_alltoall_i1(make_buffers_meta_send, send_size_proc(imeta, :), send_displ_proc(imeta, :), &
                              make_buffers_meta_recv, recv_size_proc(imeta, :), recv_displ_proc(imeta, :), &
                              mp_obj, &
                              most_ptp=.TRUE., remainder_ptp=.TRUE., no_hybrid=.FALSE.)
      CALL timestop(handle2)
!$OMP END MASTER
!$OMP BARRIER
!$    IF (is_left .AND. do_symmetry) THEN
!$       call omp_destroy_lock(locks(ithread))
!$    END IF
      !
      ! Arrange data in the local buffers in images
      data_buffer_p => buffer%data
      meta_buffer_p => buffer%meta
      DO ui = 1, nimages
         ! Check for empty images
         IF (local_images_size(imeta, ui) .EQ. 0) CYCLE
         DO dst_proc = 0, nprocs_total - 1
            IF (recv_sizes(imeta, ithread, ui, dst_proc) .EQ. 0) CYCLE
            ! Skip local data
            IF (dst_proc .EQ. mynode) THEN
               offset_threads(:, ithread, ui) = offset_threads(:, ithread, ui) + &
                                                recv_sizes(:, ithread, ui, dst_proc)
            ELSE
               ! Copy meta, block by block
               DO blk = recv_displs(imeta, ithread, ui, dst_proc), &
                  recv_displs(imeta, ithread, ui, dst_proc) + recv_sizes(imeta, ithread, ui, dst_proc) - 1, 3
                  stored_row = local_g2l_map_rows(make_buffers_meta_recv(blk))
                  stored_col = local_g2l_map_cols(make_buffers_meta_recv(blk + 1))
                  meta_buffer_p(offset_threads(imeta, ithread, ui)) = stored_row
                  meta_buffer_p(offset_threads(imeta, ithread, ui) + 1) = stored_col
                  meta_buffer_p(offset_threads(imeta, ithread, ui) + 2) = make_buffers_meta_recv(blk + 2) + &
                                                                          offset_threads(idata, ithread, ui)
                  offset_threads(imeta, ithread, ui) = offset_threads(imeta, ithread, ui) + 3
                  ! Count the maximum possible multiplies per row for on-the-fly filtering
                  IF (is_left .AND. otf_filtering) THEN
!$OMP ATOMIC
                     left_total_row_counts(stored_row) = &
                        left_total_row_counts(stored_row) + 1
                  END IF
               END DO
               ! Copy data
               CALL dbcsr_data_set(data_buffer_p, &
                                   offset_threads(idata, ithread, ui) + local_images_displ(idata, ui) + 1, &
                                   recv_sizes(idata, ithread, ui, dst_proc), &
                                   make_buffers_data_recv, recv_displs(idata, ithread, ui, dst_proc))
               offset_threads(idata, ithread, ui) = offset_threads(idata, ithread, ui) + &
                                                    recv_sizes(idata, ithread, ui, dst_proc)
            END IF
         END DO
      END DO
!$OMP END PARALLEL
      DEALLOCATE (send_sizes, recv_sizes)
      DEALLOCATE (send_displs, recv_displs, offset_data, offset_threads)
      DEALLOCATE (send_size_proc, send_displ_proc, recv_size_proc, recv_displ_proc)
      !
      IF (is_left .AND. otf_filtering) THEN
         CALL mp_isum(left_total_row_counts, dbcsr_mp_my_row_group(mp_obj), request_count_rows)
      END IF
      !
      CALL setup_rec_index_images(buffer%meta, img_nblks_rows, img_nblks_cols, &
                                  local_images_size(imeta, :), local_images_displ(imeta, :), &
                                  size_index, is_left)
      IF (buffer%has_rma_win) THEN
         do_win_create(1) = do_win_create(1) .OR. dbcsr_data_exists(buffer%data_before_resize)
         do_win_create(2) = do_win_create(2) .OR. ASSOCIATED(buffer%meta_before_resize)
         CALL mp_isum(do_win_create, buffer%subgrp, requests_win_create(irequests))
      END IF
      !
      IF (is_left) THEN
         NULLIFY (local_g2l_map_rows)
         DEALLOCATE (local_g2l_map_cols)
      ELSE
         DEALLOCATE (local_g2l_map_rows)
         NULLIFY (local_g2l_map_cols)
      END IF
!$    IF (is_left .AND. do_symmetry) THEN
!$       DEALLOCATE (locks)
!$    END IF
      !
      DEALLOCATE (img_nblks_rows, img_nblks_cols)
      DEALLOCATE (local_images_displ)
      !
      CALL timestop(handle)
   END SUBROUTINE make_buffers

   SUBROUTINE make_layers_3D_AB(my_num_layers_3D, side3D, mp_obj, is_left, buffer)
      !! Make communicators for A and B matrices
      INTEGER, INTENT(IN)                                :: my_num_layers_3D, side3D
      TYPE(dbcsr_mp_obj), INTENT(IN)                     :: mp_obj
      LOGICAL, INTENT(IN)                                :: is_left
      TYPE(dbcsr_buffer), INTENT(INOUT)                  :: buffer

      INTEGER                                            :: color, key, mypcol, myprow
      TYPE(mp_comm_type)                                 :: mygrp

      ! Switch to single layer communicator
      IF (my_num_layers_3D .LE. 1) THEN
         IF (buffer%num_layers_3D .GT. 1 .AND. buffer%subgrp .NE. mp_comm_null) &
            CALL mp_comm_free(buffer%subgrp)
         buffer%num_layers_3D = 1
         IF (is_left) THEN
            buffer%subgrp = dbcsr_mp_my_row_group(mp_obj)
         ELSE
            buffer%subgrp = dbcsr_mp_my_col_group(mp_obj)
         END IF
         RETURN
      END IF
      !
      ! Check if any existing 3D communicator can be reused
      mygrp = dbcsr_mp_group(mp_obj)
      IF (buffer%grp .EQ. mygrp .AND. buffer%num_layers_3D .EQ. my_num_layers_3D) RETURN
      !
      ! Reset previous 3D communicator
      IF (buffer%num_layers_3D .GT. 1 .AND. buffer%subgrp .NE. mp_comm_null) &
         CALL mp_comm_free(buffer%subgrp)
      !
      myprow = dbcsr_mp_myprow(mp_obj)
      mypcol = dbcsr_mp_mypcol(mp_obj)
      IF (is_left) THEN
         color = MOD(myprow, side3D)
         ! Column-major order
         key = mypcol*(dbcsr_mp_nprows(mp_obj)/side3D) + myprow/side3D
      ELSE
         color = MOD(mypcol, side3D)
         ! Row-major order
         key = myprow*(dbcsr_mp_npcols(mp_obj)/side3D) + mypcol/side3D
      END IF
      CALL mp_comm_split_direct(mygrp, buffer%subgrp, color, key)
      buffer%num_layers_3D = my_num_layers_3D
   END SUBROUTINE make_layers_3D_AB

   PURE FUNCTION get_rank3D(myprow, mypcol, nprows, side3D)
      !! Return the rank of the 3D layer (3D communicator for C), Column-major order
      INTEGER, INTENT(IN)                                :: myprow, mypcol, nprows, side3D
      INTEGER                                            :: get_rank3D

      get_rank3D = myprow/side3D + (nprows/side3D)*(mypcol/side3D)
   END FUNCTION get_rank3D

   SUBROUTINE make_layers_3D_C_reduction(my_num_layers_3D, mp_obj)
      !! Make communicators for 3D layers for C-reduction
      INTEGER, INTENT(IN)                                :: my_num_layers_3D
      TYPE(dbcsr_mp_obj), INTENT(INOUT)                  :: mp_obj

      CHARACTER(len=100)                                 :: msg
      INTEGER                                            :: color, key, mypcol, myprow, &
                                                            npcols, nprows, numnodes
      LOGICAL                                            :: do_layers_3D
      LOGICAL, SAVE                                      :: warning = .TRUE.
      TYPE(mp_comm_type)                                 :: mygrp

      CALL dbcsr_mp_grid_setup(mp_obj)
      IF (my_num_layers_3D .LE. 1) THEN
         ! Reset 3D communicator if it was previously declared
         IF (layers_3D_C_reduction%num_layers_3D .GT. 1) CALL release_layers_3D_C_reduction()
         RETURN
      END IF
      !
      ! Check if any existing 3D communicator can be reused
      mygrp = dbcsr_mp_group(mp_obj)
      IF (layers_3D_C_reduction%grp .EQ. mygrp .AND. &
          layers_3D_C_reduction%num_layers_3D .EQ. my_num_layers_3D) RETURN
      !
      ! Reset 3D communicator
      CALL release_layers_3D_C_reduction()
      !
      ! Checks for 3D algorithm
      numnodes = dbcsr_mp_numnodes(mp_obj)
      nprows = dbcsr_mp_nprows(mp_obj)
      npcols = dbcsr_mp_npcols(mp_obj)
      IF (dbcsr_cfg%use_mpi_rma%val) THEN
         IF (nprows .NE. npcols) THEN
            ! No square topology, scale the maximum coordinate
            do_layers_3D = MAX(nprows, npcols) .EQ. (my_num_layers_3D*MIN(nprows, npcols)) .AND. &
                           my_num_layers_3D .LE. MIN(nprows, npcols)
         ELSE
            ! Square topology, scale both coordinates
            do_layers_3D = ((nprows/NINT(SQRT(REAL(MAX(1, my_num_layers_3D), KIND=real_8))))**2)* &
                           my_num_layers_3D .EQ. (nprows*npcols)
         END IF
         IF (.NOT. do_layers_3D .AND. warning) THEN
            WRITE (UNIT=msg, FMT='(A,I3,A,I3,A,I3,A)') "Cannot make 3D layers with ", my_num_layers_3D, &
               " layers and (", nprows, "x", npcols, ") ranks! Run with a single layer."
            DBCSR_WARN(msg)
            warning = .FALSE.
         END IF
         IF (do_layers_3D) THEN
            layers_3D_C_reduction%grp = mygrp
            layers_3D_C_reduction%num_layers_3D = my_num_layers_3D
            layers_3D_C_reduction%max_num_layers_3D = &
               MAX(layers_3D_C_reduction%max_num_layers_3D, &
                   my_num_layers_3D)
            layers_3D_C_reduction%side3D = NINT(SQRT(REAL(numnodes/my_num_layers_3D, KIND=real_8)))
            !
            ! Create a new 3D communicator
            myprow = dbcsr_mp_myprow(mp_obj)
            mypcol = dbcsr_mp_mypcol(mp_obj)
            ! Row-wise order for color
            color = MOD(myprow, layers_3D_C_reduction%side3D)* &
                    layers_3D_C_reduction%side3D + MOD(mypcol, layers_3D_C_reduction%side3D)
            ! Column-major order
            key = get_rank3D(myprow, mypcol, nprows, layers_3D_C_reduction%side3D)
            CALL mp_comm_split_direct(mygrp, layers_3D_C_reduction%grp3D, color, key)
            !
            ! Create a 3D-row communicator based on the 3D communicator
            color = key/(nprows/layers_3D_C_reduction%side3D)
            CALL mp_comm_split_direct(layers_3D_C_reduction%grp3D, &
                                      layers_3D_C_reduction%rowgrp3D, color, key)
         END IF
      ELSE
         DBCSR_WARN('Cannot make 3D layers without experimental MPI algorithm enabled!')
      END IF
   END SUBROUTINE make_layers_3D_C_reduction

   SUBROUTINE release_layers_3D_C_reduction(release_buffers)
      !! Release communicators for 3D layers for C-reduction
      LOGICAL, OPTIONAL                                  :: release_buffers

      INTEGER                                            :: ibuff

      layers_3D_C_reduction%grp = mp_comm_null
      IF (layers_3D_C_reduction%rowgrp3D .NE. mp_comm_null) CALL mp_comm_free(layers_3D_C_reduction%rowgrp3D)
      IF (layers_3D_C_reduction%grp3D .NE. mp_comm_null) CALL mp_comm_free(layers_3D_C_reduction%grp3D)
      layers_3D_C_reduction%rowgrp3D = mp_comm_null
      layers_3D_C_reduction%grp3D = mp_comm_null
      layers_3D_C_reduction%num_layers_3D = 1
      layers_3D_C_reduction%side3D = HUGE(1)

      IF (PRESENT(release_buffers)) THEN
         IF (release_buffers .AND. ALLOCATED(layers_3D_C_reduction%data_red3D)) THEN
            DO ibuff = 1, SIZE(layers_3D_C_reduction%data_red3D)
               CALL dbcsr_data_release(layers_3D_C_reduction%data_red3D(ibuff))
            END DO
            DEALLOCATE (layers_3D_C_reduction%data_red3D)
         END IF
      END IF
   END SUBROUTINE release_layers_3D_C_reduction

   SUBROUTINE multiply_3D(imgdist_left, imgdist_right, &
                          matrix_left, matrix_right, &
                          product_matrix, &
                          retain_sparsity, &
                          filter_eps, flop, keep_product_data)
      !! Multiplies two DBCSR matrices (experimental MPI algorithm).
      !! This algorithm is experimental and it should be not used in
      !! production runs.

      TYPE(dbcsr_imagedistribution_obj), INTENT(INOUT)   :: imgdist_left, imgdist_right
      TYPE(dbcsr_type), INTENT(IN)                       :: matrix_left, matrix_right
      TYPE(dbcsr_type), INTENT(INOUT), TARGET            :: product_matrix
         !! DBCSR product matrix
      LOGICAL, INTENT(IN), OPTIONAL                      :: retain_sparsity
         !! retain the sparsity of the existing product matrix; default is no
      REAL(kind=real_8), INTENT(IN), OPTIONAL            :: filter_eps
      INTEGER(KIND=int_8), INTENT(OUT)                   :: flop
         !! effective flop
      LOGICAL, INTENT(IN)                                :: keep_product_data

      CHARACTER(len=*), PARAMETER :: routineN = 'multiply_3D'
      INTEGER :: blk, data_type, data_type_byte, final_step_k, handle, &
                 handle1, handle2, icol3D, icol3D_send, ileft_buffer_calc, ileft_buffer_comm, &
                 iright_buffer_calc, iright_buffer_comm, irow3D, irow3D_send, istep_k_ordered, ithread, &
                 ivirt_k, last_step_k, left_col_mult, left_col_nimages, left_col_total_nimages, &
                 left_max_data_size, left_max_meta_size, left_myfirstvcol, left_myfirstvrow, left_mypcol, &
                 left_myprow, left_npcols, left_nprows, left_row_mult, left_row_nimages, &
                 leftovers_first_k, leftovers_k, leftovers_shift_k, leftovers_start_k, min_nimages, &
                 mycol3D, mypcol, myprow
      INTEGER :: myrank3D, myrow3D, myt, nblkrows_local, nbuffers, nbuffers_norms, ncols3D, &
                 nranks3D, nrows3D, nthreads, numnodes, nvirt_k, proc3D_recv, proc3D_send, recv_vcol, &
                 recv_vrow, right_col_mult, right_col_nimages, &
                 right_max_data_size, right_max_meta_size, right_myfirstvcol, right_myfirstvrow, &
                 right_mypcol, right_myprow, right_npcols, right_nprows, right_row_mult, &
                 right_row_nimages, right_row_total_nimages, row, shift3D, shift3D_comm, shift3D_recv, &
                 size_guess, size_guess_init, start_k, start_k_ordered, v_ki
      INTEGER(KIND=int_8)                                :: mem
      INTEGER, ALLOCATABLE, DIMENSION(:) :: left_vrow, product_matrix_epss_displ, &
                                            product_matrix_epss_size, product_matrix_meta, product_matrix_size_recv, &
                                            product_matrix_size_send, right_vcol
      INTEGER, ALLOCATABLE, DIMENSION(:, :)              :: product_matrix_meta_displ, &
                                                            product_matrix_meta_size
      TYPE(mp_request_type)                              :: request_epss, request_keep_sparsity
      TYPE(mp_request_type), DIMENSION(2)                :: requests_reduction_size
      TYPE(mp_request_type), ALLOCATABLE, DIMENSION(:)   :: requests_reduction
      INTEGER, DIMENSION(:), POINTER, CONTIGUOUS :: col_blk_sizes2enum, enum2col_blk_sizes, &
                                                    enum2row_blk_sizes, product_matrix_meta_recv, product_matrix_meta_send, &
                                                    row_blk_sizes2enum
      INTEGER, DIMENSION(:), POINTER, CONTIGUOUS         :: k_sizes
      INTEGER, DIMENSION(:, :, :), POINTER, CONTIGUOUS   :: left_displ_layers3D, &
                                                            left_images_size_layers3D, &
                                                            right_displ_layers3D, &
                                                            right_images_size_layers3D
      INTEGER, DIMENSION(dbcsr_slot_nblkrows_total: &
                         dbcsr_slot_nfullcols_local)                     :: left_global_indices, right_global_indices
      INTEGER, POINTER                                   :: istep_k_comm
      INTEGER, TARGET                                    :: istep_k, istep_k_comm_curr
      LOGICAL                                            :: do_layers3D, do_square_layers3D, &
                                                            first_k, first_v_k, is_not_comm, &
                                                            keep_sparsity, otf_filtering
      LOGICAL, ALLOCATABLE, DIMENSION(:)                 :: do_comm_left, do_comm_right
      REAL(kind=sp)                                      :: filter_eps_sp
      REAL(kind=sp), ALLOCATABLE, DIMENSION(:), TARGET   :: row_max_epss
      REAL(kind=sp), ALLOCATABLE, DIMENSION(:, :)        :: left_norms, right_norms
      REAL(kind=sp), DIMENSION(:), POINTER, CONTIGUOUS   :: product_matrix_epss
      TYPE(dbcsr_2d_array_obj)                           :: product_matrix3D
      TYPE(dbcsr_buffer), ALLOCATABLE, DIMENSION(:), &
         TARGET                                          :: left_buffers, right_buffers
      TYPE(dbcsr_buffer), POINTER                        :: left_buffer_p, right_buffer_p
      TYPE(dbcsr_data_obj)                               :: data_get, data_send
      TYPE(dbcsr_mm_multrec_type_p), ALLOCATABLE, &
         DIMENSION(:, :, :)                              :: multrec
      TYPE(dbcsr_mp_obj)                                 :: left_mp_obj, product_mp_obj, right_mp_obj
      TYPE(mn_local_sizes), ALLOCATABLE, DIMENSION(:)    :: m_sizes, n_sizes
      TYPE(mp_comm_type)                                 :: grp_left, grp_right

      CALL timeset(routineN, handle)
      !
      NULLIFY (row_blk_sizes2enum, enum2row_blk_sizes)
      NULLIFY (col_blk_sizes2enum, enum2col_blk_sizes)
      NULLIFY (k_sizes)
      !
      IF (PRESENT(retain_sparsity)) THEN
         keep_sparsity = retain_sparsity
      ELSE
         keep_sparsity = .FALSE.
      END IF
      otf_filtering = PRESENT(filter_eps)
      !
!$OMP PARALLEL DEFAULT (NONE) &
!$OMP SHARED (nthreads)
!$OMP MASTER
      nthreads = 1
!$    nthreads = OMP_GET_NUM_THREADS()
!$OMP END MASTER
!$OMP END PARALLEL
      !
      ! Dummy checks
      IF (.NOT. ASSOCIATED(product_matrix%wms)) &
         DBCSR_ABORT("Work matrices do not exist")
      IF (SIZE(product_matrix%wms) .NE. nthreads) &
         DBCSR_ABORT("Work matrices not correctly sized.")
      IF (.NOT. buffers_win%left%is_valid .OR. &
          .NOT. buffers_win%right%is_valid .OR. &
          .NOT. ASSOCIATED(buffers_win%left%meta) .OR. &
          .NOT. ASSOCIATED(buffers_win%right%meta) .OR. &
          .NOT. ASSOCIATED(left_images_size) .OR. &
          .NOT. ASSOCIATED(right_images_size) .OR. &
          .NOT. ALLOCATED(left_local_images_size) .OR. &
          .NOT. ALLOCATED(right_local_images_size)) &
         DBCSR_ABORT("No buffers associated for the experimental algo!")
      !
      ! Set up variables
      flop = 0
      data_type = dbcsr_get_data_type(product_matrix)
      data_type_byte = dbcsr_datatype_sizeof(data_type)
      left_row_nimages = imgdist_left%i%row_decimation
      left_row_mult = imgdist_left%i%row_multiplicity
      left_col_nimages = imgdist_left%i%col_decimation
      left_col_mult = imgdist_left%i%col_multiplicity
      right_row_nimages = imgdist_right%i%row_decimation
      right_row_mult = imgdist_right%i%row_multiplicity
      right_col_nimages = imgdist_right%i%col_decimation
      right_col_mult = imgdist_right%i%col_multiplicity
      left_mp_obj = dbcsr_distribution_mp(imgdist_left%i%main)
      right_mp_obj = dbcsr_distribution_mp(imgdist_right%i%main)
      product_mp_obj = dbcsr_distribution_mp(product_matrix%dist)
      numnodes = dbcsr_mp_numnodes(product_mp_obj)
      myprow = dbcsr_mp_myprow(product_mp_obj)
      mypcol = dbcsr_mp_mypcol(product_mp_obj)
      left_nprows = dbcsr_mp_nprows(left_mp_obj)
      left_npcols = dbcsr_mp_npcols(left_mp_obj)
      left_myprow = dbcsr_mp_myprow(left_mp_obj)
      left_mypcol = dbcsr_mp_mypcol(left_mp_obj)
      left_myfirstvrow = MOD(left_myprow, layers_3D_C_reduction%side3D)*left_row_nimages
      left_myfirstvcol = MOD(left_mypcol, layers_3D_C_reduction%side3D)*left_col_nimages
      right_nprows = dbcsr_mp_nprows(right_mp_obj)
      right_npcols = dbcsr_mp_npcols(right_mp_obj)
      right_myprow = dbcsr_mp_myprow(right_mp_obj)
      right_mypcol = dbcsr_mp_mypcol(right_mp_obj)
      right_myfirstvrow = MOD(right_myprow, layers_3D_C_reduction%side3D)*right_row_nimages
      right_myfirstvcol = MOD(right_mypcol, layers_3D_C_reduction%side3D)*right_col_nimages
      left_col_total_nimages = left_npcols*left_col_nimages
      right_row_total_nimages = right_nprows*right_row_nimages
      grp_right = buffers_win%right%subgrp
      grp_left = buffers_win%left%subgrp
      !
      do_layers3D = layers_3D_C_reduction%num_layers_3D .GT. 1
      myrow3D = myprow/layers_3D_C_reduction%side3D + 1
      mycol3D = mypcol/layers_3D_C_reduction%side3D + 1
      nrows3D = SIZE(left_images_size, 3)
      ncols3D = SIZE(right_images_size, 3)
      myrank3D = get_rank3D(myprow, mypcol, dbcsr_mp_nprows(product_mp_obj), layers_3D_C_reduction%side3D)
      nranks3D = layers_3D_C_reduction%num_layers_3D
      myprow = MOD(myprow, layers_3D_C_reduction%side3D)
      mypcol = MOD(mypcol, layers_3D_C_reduction%side3D)
      !
      ! Dummy checks
      ! subcommunicators
      IF (.NOT. dbcsr_mp_has_subgroups(right_mp_obj)) &
         DBCSR_ABORT("Experimental algorithm requires rows subcommunicators for right matrix!")
      IF (.NOT. dbcsr_mp_has_subgroups(left_mp_obj)) &
         DBCSR_ABORT("Experimental algorithm requires columns subcommunicators for left matrix!")
      ! Right col nimages
      IF (right_col_nimages .NE. 1) &
         DBCSR_ABORT("Col nimages for right matrix is not 1!")
      ! Left row nimages
      IF (left_row_nimages .NE. 1) &
         DBCSR_ABORT("Row nimages for left matrix is not 1!")
      ! left/right matching
      IF (left_col_nimages .NE. right_row_mult) &
         DBCSR_ABORT("Left/Right image mismatch")
      IF (left_col_mult .NE. right_row_nimages) &
         DBCSR_ABORT("Left/Right image mismatch")
      IF (left_col_nimages*left_npcols .NE. right_row_nimages*right_nprows) &
         DBCSR_ABORT("Left/Right total mismatch")
      ! product/left matching
      IF (left_row_mult*dbcsr_mp_nprows(product_mp_obj) .NE. left_nprows) &
         DBCSR_ABORT("Product/Left total mismatch")
      ! product/left matching
      IF (right_col_mult*dbcsr_mp_npcols(product_mp_obj) .NE. right_npcols) &
         DBCSR_ABORT("Product/Right total mismatch")
      ! Check sizes from make_buffers
      IF (SIZE(left_images_size, 2) .NE. left_col_nimages .OR. &
          SIZE(right_images_size, 2) .NE. right_row_nimages) &
         DBCSR_ABORT("Mismatch in the sizes")
      !
      dbcsr_mpi_statistics%nimages = MAX(dbcsr_mpi_statistics%nimages, left_col_nimages)
      dbcsr_mpi_statistics%nimages = MAX(dbcsr_mpi_statistics%nimages, right_row_nimages)
      !
      ! The main transfer loop goes through the virtual rows/columns.
      ! The number of steps may be smaller if the grid dimension is very
      ! non-optimal (both left column images and right row images are >
      ! 1).
      min_nimages = MIN(left_col_nimages, right_row_nimages)
      nvirt_k = left_npcols*left_col_nimages
      !
      ! Check RMA windows creation for original data
      CALL win_setup(buffers_win%left, do_win_create_left, requests_win_create(1))
      CALL win_setup(buffers_win%right, do_win_create_right, requests_win_create(2))
      !
      ! Count the maximum possible multiplies per row for on-the-fly filtering
      ALLOCATE (product_matrix_epss_size(nrows3D), product_matrix_epss_displ(nrows3D))
      IF (otf_filtering) THEN
         ! Wait for counts (sent in make_buffers)
         CALL timeset(routineN//"_count_rows", handle1)
         CALL mp_wait(request_count_rows)
         !
         nblkrows_local = SIZE(left_total_row_counts)
         ALLOCATE (row_max_epss(nblkrows_local))
         filter_eps_sp = REAL(filter_eps, KIND=KIND(row_max_epss))
!$OMP PARALLEL DO DEFAULT (NONE) &
!$OMP SHARED(nblkrows_local,row_max_epss,filter_eps_sp,&
!$OMP        left_total_row_counts)
         ! Determine the maximum per-block epsilon
         DO row = 1, nblkrows_local
            row_max_epss(row) = &
               (filter_eps_sp/REAL(MAX(1, left_total_row_counts(row)), KIND=KIND(row_max_epss)))**2
         END DO
!$OMP END PARALLEL DO
         DEALLOCATE (left_total_row_counts)
         !
         IF (do_layers3D .AND. nrows3D .GT. 1) THEN
            CALL mp_allgather(SIZE(row_max_epss), &
                              product_matrix_epss_size, &
                              layers_3D_C_reduction%rowgrp3D)
            size_guess = 0
            DO irow3D = 1, nrows3D
               product_matrix_epss_displ(irow3D) = size_guess
               size_guess = size_guess + product_matrix_epss_size(irow3D)
            END DO
            ALLOCATE (product_matrix_epss(size_guess))
            CALL mp_iallgather(row_max_epss, &
                               product_matrix_epss, product_matrix_epss_size, product_matrix_epss_displ, &
                               layers_3D_C_reduction%rowgrp3D, request_epss)
         ELSE
            product_matrix_epss_size(nrows3D) = SIZE(row_max_epss)
            product_matrix_epss_displ(nrows3D) = 0
            product_matrix_epss => row_max_epss
         END IF
         CALL timestop(handle1)
      ELSE
         product_matrix_epss_size(:) = 0
         product_matrix_epss_displ(:) = 0
         ALLOCATE (product_matrix_epss(0))
      END IF
      !
      ! Exchange 3D meta for C matrix
      IF (do_layers3D .AND. keep_sparsity) THEN
         ALLOCATE (product_matrix_meta_size(nrows3D, ncols3D))
         CALL mp_allgather(product_matrix%index(dbcsr_slot_size), &
                           product_matrix_meta_size, layers_3D_C_reduction%grp3D)
         ALLOCATE (product_matrix_meta_displ(nrows3D, ncols3D))
         size_guess = 0
         DO icol3D = 1, ncols3D
            DO irow3D = 1, nrows3D
               product_matrix_meta_displ(irow3D, icol3D) = size_guess
               size_guess = size_guess + product_matrix_meta_size(irow3D, icol3D)
            END DO
         END DO
         ALLOCATE (product_matrix_meta(size_guess))
         product_matrix%index(dbcsr_slot_nblks) = product_matrix%nblks
         product_matrix%index(dbcsr_slot_nze) = product_matrix%nze
         CALL mp_iallgather(product_matrix%index(1:product_matrix%index(dbcsr_slot_size)), &
                            product_matrix_meta, product_matrix_meta_size, product_matrix_meta_displ, &
                            layers_3D_C_reduction%grp3D, request_keep_sparsity)
      END IF
      !
      ! Wait refs and max norms (sent in make_buffers)
      CALL timeset(routineN//"_sizes", handle1)
      CALL mp_waitall(requests)
      CALL timestop(handle1)
      DEALLOCATE (right_local_images_size, left_local_images_size)
      !
      ! Needs to remap refs for virtual coordinates 3D
      CALL remap_layers3D(left_images_size, left_images_size_layers3D, left_displ_layers3D, &
                          left_max_data_size, left_max_meta_size)
      CALL remap_layers3D(right_images_size, right_images_size_layers3D, right_displ_layers3D, &
                          right_max_data_size, right_max_meta_size)
      left_max_meta_size = left_max_meta_size + dbcsr_num_slots
      right_max_meta_size = right_max_meta_size + dbcsr_num_slots
      !
      do_square_layers3D = .FALSE.
      nbuffers_norms = 1
      IF (nvirt_k .EQ. 1) THEN
         nbuffers = 1
      ELSEIF (nrows3D .NE. ncols3D .OR. nranks3D .EQ. 1) THEN
         nbuffers = 2
      ELSE
         ! Note that nrows3D==ncols3D >= 2
         ! Last buffer is used as temporary for communications
         nbuffers = nrows3D + 1
         nbuffers_norms = nrows3D
         do_square_layers3D = .TRUE.
      END IF
      !
      ! update capacity of memory-pools
      IF (ASSOCIATED(memtype_abpanel_1%pool)) &
         CALL dbcsr_mempool_limit_capacity(memtype_abpanel_1%pool, &
                                           capacity=2)
      IF (ASSOCIATED(memtype_abpanel_2%pool)) &
         CALL dbcsr_mempool_limit_capacity(memtype_abpanel_2%pool, &
                                           capacity=2)
      IF (use_acc()) THEN
         ! enumerate the blocksizes to keep the following 2D-arrays small.
         CALL enumerate_blk_sizes(matrix_right%row_blk_size%low%data, &
                                  dbcsr_max_row_size(matrix_right), &
                                  row_blk_sizes2enum, enum2row_blk_sizes)
         CALL enumerate_blk_sizes(matrix_right%col_blk_size%low%data, &
                                  dbcsr_max_col_size(matrix_right), &
                                  col_blk_sizes2enum, enum2col_blk_sizes)
      END IF
      IF (nranks3D .GT. 1) THEN
         CALL dbcsr_mempool_limit_capacity(memtype_mpi_product%pool, &
                                           capacity=nranks3D - 1)
      END IF
      !
      ! Prepare buffers for computation
      IF (nvirt_k .GT. 1) THEN
         ! Right
         CALL buffer_init(buffers_2%right, data_type, &
                          right_max_data_size, &
                          right_max_meta_size, &
                          num_data=(nbuffers/2), &
                          data_memory_type=memtype_abpanel_2, &
                          trs_memory_type=memtype_trsbuffer_2)
         ! Left
         CALL buffer_init(buffers_2%left, data_type, &
                          left_max_data_size, &
                          left_max_meta_size, &
                          num_data=(nbuffers/2), &
                          data_memory_type=memtype_abpanel_2)
      END IF
      !
      ! Prepare buffers for communication
      ! Right
      CALL buffer_init(buffers_1%right, data_type, &
                       right_max_data_size, &
                       right_max_meta_size, &
                       num_data=(nbuffers - nbuffers/2), &
                       data_memory_type=memtype_abpanel_1, &
                       trs_memory_type=memtype_trsbuffer_1)
      ! Left
      CALL buffer_init(buffers_1%left, data_type, &
                       left_max_data_size, &
                       left_max_meta_size, &
                       num_data=(nbuffers - nbuffers/2), &
                       data_memory_type=memtype_abpanel_1)
      !
      CALL setup_buffers(buffers_1%right, buffers_2%right, &
                         right_buffers, nbuffers, &
                         right_max_data_size, &
                         right_max_meta_size, &
                         matrix_right, imgdist_right)
      CALL setup_buffers(buffers_1%left, buffers_2%left, &
                         left_buffers, nbuffers, &
                         left_max_data_size, &
                         left_max_meta_size, &
                         matrix_left, imgdist_left)
      !
      ! Setup the receive data pointers
      CALL dbcsr_data_init(data_get)
      CALL dbcsr_data_new(data_get, data_type)
      IF (do_layers3D) THEN
         CALL dbcsr_data_init(data_send)
         CALL dbcsr_data_new(data_send, data_type)
         ! Prepare buffers for 3D reduction
         IF (ALLOCATED(layers_3D_C_reduction%data_red3D)) THEN
            IF (SIZE(layers_3D_C_reduction%data_red3D) .LT. nthreads .OR. &
                layers_3D_C_reduction%data_type .NE. data_type) THEN
               DO myt = 1, SIZE(layers_3D_C_reduction%data_red3D)
                  CALL dbcsr_data_release(layers_3D_C_reduction%data_red3D(myt))
               END DO
               DEALLOCATE (layers_3D_C_reduction%data_red3D)
               layers_3D_C_reduction%data_type = 0
            END IF
         END IF
         IF (.NOT. ALLOCATED(layers_3D_C_reduction%data_red3D)) THEN
            ALLOCATE (layers_3D_C_reduction%data_red3D(nthreads))
            DO myt = 1, nthreads
               CALL dbcsr_data_init(layers_3D_C_reduction%data_red3D(myt))
               CALL dbcsr_data_new(layers_3D_C_reduction%data_red3D(myt), data_type)
            END DO
            layers_3D_C_reduction%data_type = data_type
         END IF
         ALLOCATE (product_matrix_size_send(nthreads + 1), product_matrix_size_recv(nthreads + 1))
         ALLOCATE (requests_reduction((nthreads + 1)*2))
      END IF
      !
      ! These values for meta data are used for global values
      right_global_indices(dbcsr_slot_nblkrows_total:dbcsr_slot_nfullcols_local) = &
         (/ &
         dbcsr_nblkrows_total(matrix_right), &
         dbcsr_nblkcols_total(matrix_right), &
         dbcsr_nfullrows_total(matrix_right), &
         dbcsr_nfullcols_total(matrix_right), &
         0, 0, &
         dbcsr_nfullrows_local(matrix_right), &
         dbcsr_nfullcols_local(matrix_right)/)
      left_global_indices(dbcsr_slot_nblkrows_total:dbcsr_slot_nfullcols_local) = &
         (/ &
         dbcsr_nblkrows_total(matrix_left), &
         dbcsr_nblkcols_total(matrix_left), &
         dbcsr_nfullrows_total(matrix_left), &
         dbcsr_nfullcols_total(matrix_left), &
         0, 0, &
         dbcsr_nfullrows_local(matrix_left), &
         dbcsr_nfullcols_local(matrix_left)/)
      !
      ! Evaluate sizes for workspaces
      size_guess_init = 1
      IF (.NOT. keep_sparsity .AND. use_acc()) THEN
         size_guess_init = product_matrix_size_guess(matrix_left, matrix_right, product_matrix, &
                                                     left_max_data_size, right_max_data_size, &
                                                     left_col_nimages, right_row_nimages, &
                                                     nthreads)
      END IF
      !
      ! Preallocate norms arrays
      IF (otf_filtering) THEN
         ALLOCATE (right_norms(right_max_meta_size/3, nbuffers_norms))
         ALLOCATE (left_norms(left_max_meta_size/3, nbuffers_norms))
         IF (do_layers3D .AND. nrows3D .GT. 1) THEN
            CALL mp_wait(request_epss)
            DEALLOCATE (row_max_epss)
         END IF
      ELSE
         ! The array must be valid when passed to called subroutines.
         ALLOCATE (right_norms(0, nbuffers_norms))
         ALLOCATE (left_norms(0, nbuffers_norms))
      END IF
      !
      IF (do_layers3D .AND. keep_sparsity) CALL mp_wait(request_keep_sparsity)
      !
      ALLOCATE (product_matrix3D%mats(nrows3D, ncols3D))
      DO icol3D = 1, ncols3D
         DO irow3D = 1, nrows3D
            NULLIFY (product_matrix3D%mats(irow3D, icol3D)%matrix)
         END DO
      END DO
      ALLOCATE (multrec(0:nthreads - 1, nrows3D, ncols3D))
      !
      ! Here is the main loop
      ! 3D multiplication
      !
      CALL timeset(routineN//"_loop", handle1)
      ! Take into account when ticks are not multiple of 3D layers
      leftovers_k = MOD(nvirt_k, nranks3D)
      leftovers_first_k = leftovers_k*myrank3D
      leftovers_start_k = 0
      leftovers_shift_k = 0
      IF (leftovers_k .GT. 0) THEN
         ! This is only for nrows3D==ncols3D
         leftovers_start_k = (nvirt_k/nrows3D - 1)*(myrank3D/nrows3D) - &
                             (leftovers_k/nrows3D - 1)*(myrank3D/nrows3D)
         leftovers_shift_k = nranks3D*(leftovers_k/nrows3D) - leftovers_k*(MOD(myrank3D, nrows3D) + 1)
      END IF
      ! Ticks bounds
      start_k = (nvirt_k/nranks3D)*myrank3D
      last_step_k = nvirt_k + leftovers_first_k
      final_step_k = last_step_k - nranks3D
      ! Shift layers to keep local layer as the last one in computation
      shift3D = (mycol3D - 1)*nrows3D + &
                (nrows3D - myrow3D + 1)*(1 - MOD(mycol3D, 2)) + myrow3D*MOD(mycol3D, 2)
      iright_buffer_comm = 0
      ileft_buffer_comm = 0
      ALLOCATE (do_comm_right(ncols3D), do_comm_left(nrows3D))
      ALLOCATE (right_vcol(ncols3D), left_vrow(nrows3D))
      ALLOCATE (m_sizes(nrows3D), n_sizes(ncols3D))
      irow3D_send = 0
      icol3D_send = 0
      first_k = .TRUE.
      first_v_k = .TRUE.
      istep_k_comm_curr = leftovers_first_k
      istep_k_comm => istep_k_comm_curr
      grouped_steps_index: DO istep_k = leftovers_first_k, last_step_k
         !
         ! Wait data.  Exclude the first iteration.
         wait: IF (istep_k .GT. leftovers_first_k) THEN
            IF (debug_mod) WRITE (*, '(1X,A)') routineN//" waiting for right and left"
            right_buffer_p => right_buffers(iright_buffer_calc)
            left_buffer_p => left_buffers(ileft_buffer_calc)
            IF (right_buffer_p%is_comm .AND. left_buffer_p%is_comm) THEN
               ! check if right matrix was already initialized
               IF (.NOT. right_buffer_p%matrix%valid) THEN
                  CALL timeset(routineN//"_comm_right", handle2)
                  CALL mp_waitall(right_buffer_p%get_requests(:))
                  CALL timestop(handle2)
               END IF
               ! check if left matrix was already initialized
               IF (.NOT. left_buffer_p%matrix%valid) THEN
                  CALL timeset(routineN//"_comm_left", handle2)
                  CALL mp_waitall(left_buffer_p%get_requests(:))
                  CALL timestop(handle2)
               END IF
            END IF
         END IF wait
         !
         ! Matrix transfer. Transfer in all but the last loop iteration.
         shift3D_comm = shift3D
         xfer: DO WHILE (istep_k_comm .LT. last_step_k)
            start_k_ordered = start_k
            istep_k_ordered = istep_k_comm
            ! Put leftovers ticks always first
            IF (leftovers_k .GT. 0) THEN
               IF (istep_k_comm .LT. leftovers_first_k + leftovers_k) THEN
                  start_k_ordered = leftovers_start_k
               ELSE
                  istep_k_ordered = istep_k_comm + leftovers_shift_k
               END IF
            END IF
            first_k = MOD(istep_k_ordered, nranks3D) .EQ. 0
            ivirt_k = istep_k_ordered/nranks3D
            IF (istep_k_comm .LT. leftovers_first_k + leftovers_k) THEN
               CALL row_col_3D_reflected(irow3D, icol3D, nrows3D, ncols3D, istep_k_ordered)
            ELSE
               CALL row_col_3D_reflected(irow3D, icol3D, nrows3D, ncols3D, shift3D)
               shift3D = shift3D + 1
            END IF
            !
            v_ki = MOD(ivirt_k, min_nimages)
            ! Reset communication flags at the first layer
            IF ((first_k .OR. istep_k_comm .EQ. leftovers_first_k) .AND. &
                istep_k_comm .EQ. istep_k_comm_curr) THEN
               do_comm_right(:) = .TRUE.
               do_comm_left(:) = .TRUE.
            END IF
            ! Take first image global virtual coordinates
            IF (v_ki .EQ. 0) THEN
               IF (istep_k_comm .GE. leftovers_first_k + leftovers_k) first_v_k = .FALSE.
               start_k_ordered = start_k_ordered + ivirt_k
            END IF
            IF (v_ki .EQ. 0 .OR. (first_v_k .AND. min_nimages .GT. 1)) THEN
               CALL image_calculator(imgdist_right, &
                                     vprow=recv_vrow, &
                                     vpcol=right_vcol(icol3D), &
                                     mypcol=mypcol, &
                                     myvprow=right_myfirstvrow, &
                                     myvpcol=right_myfirstvcol + (icol3D - 1)*layers_3D_C_reduction%side3D, &
                                     vprow_shift=start_k_ordered, &
                                     shifting='R')
               CALL image_calculator(imgdist_left, &
                                     vprow=left_vrow(irow3D), &
                                     vpcol=recv_vcol, &
                                     myprow=myprow, &
                                     myvprow=left_myfirstvrow + (irow3D - 1)*layers_3D_C_reduction%side3D, &
                                     myvpcol=left_myfirstvcol, &
                                     vpcol_shift=start_k_ordered, &
                                     shifting='L')
            END IF
            !
            ! Set coordinates
            IF (do_square_layers3D) THEN
               ! Use the temporary buffers for the communication of the first tick
               IF (first_k) THEN
                  iright_buffer_comm = nbuffers
                  ileft_buffer_comm = nbuffers
               ELSE
                  iright_buffer_comm = icol3D
                  ileft_buffer_comm = irow3D
               END IF
            ELSE
               IF (do_comm_right(icol3D)) THEN
                  iright_buffer_comm = MOD(iright_buffer_comm, nbuffers) + 1
               END IF
               IF (do_comm_left(irow3D)) THEN
                  ileft_buffer_comm = MOD(ileft_buffer_comm, nbuffers) + 1
               END IF
            END IF
            !
            ! Exit if data are already communicated
            IF (istep_k_comm .NE. istep_k_comm_curr) EXIT
            !
            right_buffer_p => right_buffers(iright_buffer_comm)
            left_buffer_p => left_buffers(ileft_buffer_comm)
            right_buffer_p%coord3D = icol3D
            left_buffer_p%coord3D = irow3D
            !
            ! First row, communicate right matrix
            IF (do_comm_right(icol3D)) THEN
               right_buffer_p%vprow = MOD(recv_vrow + v_ki, right_row_total_nimages)
               right_buffer_p%vpcol = right_vcol(icol3D)
               right_buffer_p%is_comm = .FALSE.
            END IF
            !
            is_not_comm = .TRUE.
            IF (right_images_size_layers3D(imeta, icol3D, right_buffer_p%vprow) .NE. 0) THEN
               ! First col, communicate left matrix
               IF (do_comm_left(irow3D)) THEN
                  left_buffer_p%vprow = left_vrow(irow3D)
                  left_buffer_p%vpcol = MOD(recv_vcol + v_ki, left_col_total_nimages)
                  left_buffer_p%is_comm = .FALSE.
               END IF
               !
               IF (left_images_size_layers3D(imeta, irow3D, left_buffer_p%vpcol) .NE. 0) THEN
                  ! Check if data is already communicated
                  is_not_comm = do_comm_right(icol3D) .OR. do_comm_left(irow3D)
                  IF (is_not_comm) THEN
                     ! Right
                     IF (do_comm_right(icol3D)) THEN
                        IF (use_acc()) THEN
                           CALL timeset(routineN//"_acc_sync_right", handle2)
                           CALL acc_event_synchronize(right_buffer_p%data%d%acc_ready)
                           CALL timestop(handle2)
                        END IF
                        !
                        do_comm_right(icol3D) = .FALSE.
                        CALL rma_transfer(right_buffer_p%vprow, right_row_nimages, &
                                          right_images_size_layers3D(:, icol3D, right_buffer_p%vprow), &
                                          right_displ_layers3D(:, icol3D, right_buffer_p%vprow), &
                                          right_buffer_p, &
                                          buffers_win%right%meta_win, buffers_win%right%data_win, &
                                          data_get, data_type_byte, buffers_win%right, icol3D, ncols3D)
                     END IF
                     ! Left
                     IF (do_comm_left(irow3D)) THEN
                        IF (use_acc()) THEN
                           CALL timeset(routineN//"_acc_sync_left", handle2)
                           CALL acc_event_synchronize(left_buffer_p%data%d%acc_ready)
                           CALL timestop(handle2)
                        END IF
                        !
                        do_comm_left(irow3D) = .FALSE.
                        CALL rma_transfer(left_buffer_p%vpcol, left_col_nimages, &
                                          left_images_size_layers3D(:, irow3D, left_buffer_p%vpcol), &
                                          left_displ_layers3D(:, irow3D, left_buffer_p%vpcol), &
                                          left_buffer_p, &
                                          buffers_win%left%meta_win, buffers_win%left%data_win, &
                                          data_get, data_type_byte, buffers_win%left, irow3D, nrows3D)
                     END IF
                  END IF
               END IF
            END IF
            !
            istep_k_comm_curr = istep_k_comm_curr + 1
            ! Stop looping when data is communicated
            ! Only works for 4 layers
            IF (is_not_comm .OR. nranks3D .NE. 4) THEN
               istep_k_comm => istep_k
               IF ((istep_k_comm + 1) .EQ. istep_k_comm_curr) EXIT
               ! Restore coordinates by looping once again
               shift3D = shift3D_comm
               CYCLE
            END IF
            ! Keep looping until it starts a new communication (only for 4 layers)
            istep_k_comm => istep_k_comm_curr
         END DO xfer
         !
         ! Create matrices and multrec's, only the first occurrence
         IF (.NOT. ASSOCIATED(product_matrix3D%mats(irow3D, icol3D)%matrix)) THEN
            IF (irow3D .EQ. myrow3D .AND. icol3D .EQ. mycol3D) THEN
               product_matrix3D%mats(irow3D, icol3D)%matrix => product_matrix
            ELSE
               ALLOCATE (product_matrix3D%mats(irow3D, icol3D)%matrix)
               IF (keep_sparsity) THEN
                  size_guess = product_matrix_meta(product_matrix_meta_displ(irow3D, icol3D) + &
                                                   dbcsr_slot_nze)
                  CALL setup_buffer_matrix(product_matrix3D%mats(irow3D, icol3D)%matrix, &
                                           product_matrix, product_matrix_meta_size(irow3D, icol3D), &
                                           data_size=size_guess, &
                                           data_memory_type=memtype_mpi_product)
                  product_matrix3D%mats(irow3D, icol3D)% &
                     matrix%index(1:product_matrix_meta_size(irow3D, icol3D)) = &
                     product_matrix_meta(product_matrix_meta_displ(irow3D, icol3D) + 1: &
                                         product_matrix_meta_displ(irow3D, icol3D) + &
                                         product_matrix_meta_size(irow3D, icol3D))
                  CALL dbcsr_data_clear(product_matrix3D%mats(irow3D, icol3D)%matrix%data_area, &
                                        ub=size_guess)
               ELSE
                  CALL setup_buffer_matrix(product_matrix3D%mats(irow3D, icol3D)%matrix, &
                                           product_matrix, data_memory_type=memtype_mpi_product)
               END IF
               product_matrix3D%mats(irow3D, icol3D)%matrix%index(dbcsr_slot_home_prow) = &
                  (irow3D - 1)*layers_3D_C_reduction%side3D + myprow
               product_matrix3D%mats(irow3D, icol3D)%matrix%index(dbcsr_slot_home_pcol) = &
                  (icol3D - 1)*layers_3D_C_reduction%side3D + mypcol
               CALL dbcsr_reset_locals(product_matrix3D%mats(irow3D, icol3D)%matrix)
               product_matrix3D%mats(irow3D, icol3D)%matrix%nblks = 0
               CALL dbcsr_repoint_index(product_matrix3D%mats(irow3D, icol3D)%matrix)
            END IF
            !
            IF (.NOT. ASSOCIATED(m_sizes(irow3D)%sizes)) THEN
               ALLOCATE (m_sizes(irow3D)%sizes(dbcsr_nblkrows_local(product_matrix3D%mats(irow3D, icol3D)%matrix)))
               CALL local_filter(array_data(product_matrix3D%mats(irow3D, icol3D)%matrix%row_blk_size), &
                                 array_size(product_matrix3D%mats(irow3D, icol3D)%matrix%local_rows), &
                                 array_data(product_matrix3D%mats(irow3D, icol3D)%matrix%local_rows), &
                                 m_sizes(irow3D)%sizes)
            END IF
            IF (.NOT. ASSOCIATED(n_sizes(icol3D)%sizes)) THEN
               ALLOCATE (n_sizes(icol3D)%sizes(dbcsr_nblkcols_local(product_matrix3D%mats(irow3D, icol3D)%matrix)))
               CALL local_filter(array_data(product_matrix3D%mats(irow3D, icol3D)%matrix%col_blk_size), &
                                 array_size(product_matrix3D%mats(irow3D, icol3D)%matrix%local_cols), &
                                 array_data(product_matrix3D%mats(irow3D, icol3D)%matrix%local_cols), &
                                 n_sizes(icol3D)%sizes)
            END IF
            !
!$OMP PARALLEL DEFAULT(NONE) &
!$OMP          PRIVATE (size_guess, ithread) &
!$OMP          SHARED (product_matrix3D, multrec, &
!$OMP                  keep_sparsity, filter_eps, &
!$OMP                  product_matrix_epss, &
!$OMP                  matrix_right, matrix_left, nthreads, &
!$OMP                  irow3D, icol3D, myrow3D, mycol3D, keep_product_data, &
!$OMP                  product_matrix_epss_displ, product_matrix_epss_size, &
!$OMP                  memtype_product_wm, size_guess_init, nranks3D, m_sizes, n_sizes)
            !
            ! Setup product work areas
            !
            ithread = 0
!$          ithread = OMP_GET_THREAD_NUM()
            !
            IF (irow3D .NE. myrow3D .OR. icol3D .NE. mycol3D) THEN
               IF (keep_product_data) THEN
                  CALL dbcsr_add_wm_from_matrix(product_matrix3D%mats(irow3D, icol3D)%matrix)
               ELSE
                  CALL dbcsr_work_create(product_matrix3D%mats(irow3D, icol3D)%matrix, &
                                         work_mutable=.FALSE., memory_type=memtype_product_wm(ithread)%p)
               END IF
!$OMP BARRIER
            END IF
            ! The work arrays have to be setup
            size_guess = product_matrix3D%mats(irow3D, icol3D)%matrix%wms(ithread + 1)%datasize ! Should be minimal
            IF (.NOT. keep_sparsity) THEN
               size_guess = MAX(size_guess, size_guess_init)
            END IF
            CALL dbcsr_data_ensure_size(product_matrix3D%mats(irow3D, icol3D)% &
                                        matrix%wms(ithread + 1)%data_area, &
                                        size_guess)
            CALL dbcsr_data_set_size_referenced(product_matrix3D%mats(irow3D, icol3D)% &
                                                matrix%wms(ithread + 1)%data_area, &
                                                product_matrix3D%mats(irow3D, icol3D)% &
                                                matrix%wms(ithread + 1)%datasize)
            CALL ensure_array_size(product_matrix3D%mats(irow3D, icol3D)% &
                                   matrix%wms(ithread + 1)%row_i, ub=1)
            CALL ensure_array_size(product_matrix3D%mats(irow3D, icol3D)% &
                                   matrix%wms(ithread + 1)%col_i, ub=1)
            CALL ensure_array_size(product_matrix3D%mats(irow3D, icol3D)% &
                                   matrix%wms(ithread + 1)%blk_p, ub=1)
            ALLOCATE (multrec(ithread, irow3D, icol3D)%p)
            CALL dbcsr_mm_multrec_init(multrec(ithread, irow3D, icol3D)%p, &
                                       product=product_matrix3D%mats(irow3D, icol3D)%matrix, &
                                       keep_sparsity=keep_sparsity, &
                                       eps=filter_eps, &
                                       row_max_epss=product_matrix_epss(product_matrix_epss_displ(irow3D) + 1: &
                                                                        product_matrix_epss_displ(irow3D) + &
                                                                        product_matrix_epss_size(irow3D)), &
                                       block_estimate=0, &
                                       right_row_blk_size=dbcsr_row_block_sizes(matrix_right), &
                                       m_sizes=m_sizes(irow3D)%sizes, n_sizes=n_sizes(icol3D)%sizes, &
                                       nlayers=nranks3D, &
                                       keep_product_data=keep_product_data)
!$OMP END PARALLEL
            !
            product_matrix3D%mats(irow3D, icol3D)%matrix%nblks = 0
            product_matrix3D%mats(irow3D, icol3D)%matrix%nze = 0
            product_matrix3D%mats(irow3D, icol3D)%matrix%row_p(:) = 0
            CALL dbcsr_data_set_size_referenced(product_matrix3D%mats(irow3D, icol3D)%matrix%data_area, 0)
            product_matrix3D%mats(irow3D, icol3D)%matrix%valid = .FALSE.
         END IF
         !
         ! Do the multiplication.  Exclude the first iteration.
         calc: IF (istep_k .GT. leftovers_first_k) THEN
            right_buffer_p => right_buffers(iright_buffer_calc)
            left_buffer_p => left_buffers(ileft_buffer_calc)
            irow3D = left_buffer_p%coord3D
            icol3D = right_buffer_p%coord3D
            IF (istep_k .GT. final_step_k) THEN
!$OMP PARALLEL DEFAULT (NONE) &
!$OMP SHARED (multrec, irow3D, icol3D, irow3D_send, icol3D_send, &
!$OMP         istep_k, final_step_k, product_matrix3D, &
!$OMP         handle2, requests_reduction_size, nthreads, &
!$OMP         product_matrix_meta_send, product_matrix_meta_recv, &
!$OMP         product_matrix_size_send, product_matrix_size_recv, &
!$OMP         buffers_win, data_send, data_get, proc3D_send, proc3D_recv, &
!$OMP         layers_3D_C_reduction, requests_reduction, &
!$OMP         dbcsr_mpi_statistics, data_type_byte) &
!$OMP PRIVATE (ithread)
               ithread = 0
!$             ithread = omp_get_thread_num()
               ! Prepare data to send for 3D layer
               IF (istep_k .GT. final_step_k + 1) THEN
                  CALL dbcsr_mm_multrec_finalize( &
                     multrec(ithread, irow3D_send, icol3D_send)%p, &
                     buffers_win%left%meta_red3D)
!$OMP BARRIER
!$OMP MASTER
                  CALL timeset(routineN//"_red3D_size", handle2)
                  CALL mp_waitall(requests_reduction_size)
                  CALL timestop(handle2)
                  CALL ensure_array_size(buffers_win%right%meta_red3D, &
                                         ub=product_matrix_size_recv(1), &
                                         nocopy=.TRUE.)
                  product_matrix_meta_send => &
                     buffers_win%left%meta_red3D(1:product_matrix_size_send(1))
                  product_matrix_meta_recv => &
                     buffers_win%right%meta_red3D(1:product_matrix_size_recv(1))
                  CALL mp_isendrecv(product_matrix_meta_send, proc3D_send, &
                                    product_matrix_meta_recv, proc3D_recv, &
                                    layers_3D_C_reduction%grp3D, &
                                    requests_reduction(1), requests_reduction(2))
                  DO myt = 1, nthreads
                     CALL dbcsr_data_ensure_size(layers_3D_C_reduction%data_red3D(myt), &
                                                 product_matrix_size_recv(myt + 1), &
                                                 nocopy=.TRUE.)
                     CALL dbcsr_data_set_pointer( &
                        area=data_send, &
                        rsize=product_matrix_size_send(myt + 1), &
                        csize=1, &
                        pointee=product_matrix3D%mats(irow3D_send, icol3D_send)%matrix%wms(myt)%data_area)
                     CALL dbcsr_data_set_pointer( &
                        area=data_get, &
                        rsize=product_matrix_size_recv(myt + 1), &
                        csize=1, &
                        pointee=layers_3D_C_reduction%data_red3D(myt))
                     CALL dbcsr_isendrecv_any(data_send, proc3D_send, &
                                              data_get, proc3D_recv, &
                                              layers_3D_C_reduction%grp3D, &
                                              requests_reduction(3 + (myt - 1)*2), &
                                              requests_reduction(4 + (myt - 1)*2))
                     CALL count_mpi_statistics(dbcsr_mpi_statistics%data_size(1, :), &
                                               product_matrix_size_send(myt + 1), &
                                               data_type_byte, &
                                               dbcsr_mpi_statistics%data_size_breakdown(:, :, 1))
                  END DO
!$OMP END MASTER
               END IF
!$OMP END PARALLEL
            END IF
            !
            IF (right_buffer_p%is_comm .AND. left_buffer_p%is_comm) THEN
               iright_buffer_calc = MIN(iright_buffer_calc, nbuffers_norms)
               ileft_buffer_calc = MIN(ileft_buffer_calc, nbuffers_norms)
               ! check if right matrix was already initialized
               IF (.NOT. right_buffer_p%matrix%valid) THEN
                  IF (use_acc()) CALL dbcsr_data_host2dev(right_buffer_p%data)
                  ! Repoint indices of matrices
                  CALL make_meta(right_buffer_p, &
                                 right_row_total_nimages, &
                                 right_buffer_p%vprow, &
                                 right_buffer_p%vpcol, &
                                 imgdist=imgdist_right, do_merge_rows=.FALSE., &
                                 global_indices=right_global_indices)
                  CALL ensure_array_size(k_sizes, ub=array_size(right_buffer_p%matrix%local_rows))
                  CALL local_filter(array_data(right_buffer_p%matrix%row_blk_size), &
                                    array_size(right_buffer_p%matrix%local_rows), &
                                    array_data(right_buffer_p%matrix%local_rows), &
                                    k_sizes)
                  IF (otf_filtering) THEN
                     CALL calculate_norms(right_buffer_p%matrix, &
                                          right_norms(:, iright_buffer_calc), &
                                          k_sizes, n_sizes(icol3D)%sizes)
                  END IF
                  IF (use_acc()) THEN
                     CALL acc_transpose_blocks(right_buffer_p%matrix, &
                                               right_buffer_p%trs_stackbuf, &
                                               k_sizes, n_sizes(icol3D)%sizes, &
                                               row_blk_sizes2enum, enum2row_blk_sizes, &
                                               col_blk_sizes2enum, enum2col_blk_sizes, &
                                               noresize=.TRUE.)
                  END IF
               END IF
               ! check if left matrix was already initialized
               IF (.NOT. left_buffer_p%matrix%valid) THEN
                  IF (use_acc()) CALL dbcsr_data_host2dev(left_buffer_p%data)
                  ! Repoint indices of matrices
                  CALL make_meta(left_buffer_p, &
                                 left_col_total_nimages, &
                                 left_buffer_p%vprow, &
                                 left_buffer_p%vpcol, &
                                 imgdist=imgdist_left, do_merge_rows=.TRUE., &
                                 global_indices=left_global_indices, &
                                 nthreads=nthreads)
                  IF (otf_filtering) THEN
                     CALL calculate_norms(left_buffer_p%matrix, &
                                          left_norms(:, ileft_buffer_calc), &
                                          m_sizes(irow3D)%sizes, k_sizes)
                  END IF
               END IF
               !  Wait for left and right buffers transfer to device before proceeding
               IF (use_acc()) THEN
                  CALL timeset(routineN//"_sync_h2d", handle2)
                  CALL acc_device_synchronize()
                  CALL timestop(handle2)
               END IF
               !
               CALL timeset(routineN//"_multrec", handle2)
!$OMP PARALLEL DEFAULT (NONE) &
!$OMP SHARED (left_buffer_p, ileft_buffer_calc, &
!$OMP         right_buffer_p, iright_buffer_calc, &
!$OMP         left_norms,right_norms, &
!$OMP         multrec, irow3D, icol3D, handle2, k_sizes) &
!$OMP PRIVATE (ithread) &
!$OMP REDUCTION (+: flop)
               ithread = 0
!$             ithread = omp_get_thread_num()
               CALL dbcsr_mm_multrec_multiply(multrec(ithread, irow3D, icol3D)%p, &
                                              left=left_buffer_p%matrix, &
                                              right=right_buffer_p%matrix, &
                                              flop=flop, &
                                              a_norms=left_norms(:, ileft_buffer_calc), &
                                              b_norms=right_norms(:, iright_buffer_calc), &
                                              k_sizes=k_sizes)
!$OMP END PARALLEL
               CALL timestop(handle2)
            END IF
            ! Reduce 3D layers and finalize the local layer
            IF (istep_k .GT. final_step_k) THEN
               ! Wait for the other 3D layers to reduce
               IF (istep_k .GT. final_step_k + 1) THEN
                  CALL timeset(routineN//"_red3D_data", handle2)
                  CALL mp_waitall(requests_reduction)
                  CALL timestop(handle2)
                  DO myt = 0, nthreads - 1
                     DEALLOCATE (multrec(myt, irow3D_send, icol3D_send)%p)
                     CALL dbcsr_work_destroy( &
                        product_matrix3D%mats(irow3D_send, icol3D_send)%matrix%wms(myt + 1))
                  END DO
                  DEALLOCATE (product_matrix3D%mats(irow3D_send, icol3D_send)%matrix%wms)
                  CALL dbcsr_release(product_matrix3D%mats(irow3D_send, icol3D_send)%matrix)
               END IF
               irow3D_send = irow3D
               icol3D_send = icol3D
               ! Store the initial shift for the recv node
               IF (istep_k .EQ. final_step_k + 1) THEN
                  shift3D_recv = shift3D - 4
               END IF
!$OMP PARALLEL DEFAULT (NONE) &
!$OMP SHARED (multrec, irow3D, icol3D, product_matrix3D, &
!$OMP         memtype_mpi_buffer, nthreads, myt, istep_k, &
!$OMP         irow3D_send, icol3D_send, myrow3D, mycol3D, &
!$OMP         last_step_k, proc3D_send, proc3D_recv, &
!$OMP         product_matrix_size_send, product_matrix_size_recv, &
!$OMP         nrows3D, ncols3D, shift3D_recv, myrank3D, &
!$OMP         layers_3D_C_reduction, requests_reduction_size, &
!$OMP         final_step_k, handle2, buffers_win, g2l_map_rows, g2l_map_cols) &
!$OMP PRIVATE (ithread) &
!$OMP REDUCTION (+: flop)
               ithread = 0
!$             ithread = omp_get_thread_num()
               !
               ! Evaluate the size of layers to send and set the buffers
               IF (irow3D .NE. myrow3D .OR. &
                   icol3D .NE. mycol3D) THEN
                  CALL dbcsr_mm_multrec_dev2host_init(multrec(ithread, irow3D, icol3D)%p)
!$OMP ATOMIC
                  product_matrix3D%mats(irow3D_send, icol3D_send)%matrix%nblks = &
                     product_matrix3D%mats(irow3D_send, icol3D_send)%matrix%nblks + &
                     dbcsr_mm_multrec_get_nblks(multrec(ithread, irow3D_send, icol3D_send)%p)
!$OMP BARRIER
!$OMP MASTER
                  ! First (nthreads+1) positions are reserved for
                  ! the offset sizes of each thread for meta
                  CALL ensure_array_size(buffers_win%left%meta_red3D, &
                                         ub=product_matrix3D%mats(irow3D_send, icol3D_send)% &
                                         matrix%nblks*3 + (nthreads + 1), &
                                         nocopy=.TRUE.)
                  ! Set the offsets
                  buffers_win%left%meta_red3D(1) = nthreads + 1
                  DO myt = 0, nthreads - 1
                     buffers_win%left%meta_red3D(myt + 2) = &
                        buffers_win%left%meta_red3D(myt + 1) + &
                        dbcsr_mm_multrec_get_nblks(multrec(myt, irow3D_send, icol3D_send)%p)*3
                     product_matrix_size_send(myt + 2) = &
                        dbcsr_mm_multrec_get_nze(multrec(myt, irow3D_send, icol3D_send)%p)
                  END DO
                  ! Send/recv data and meta sizes
                  product_matrix_size_send(1) = &
                     buffers_win%left%meta_red3D(nthreads + 1)
                  proc3D_send = (icol3D_send - 1)*nrows3D + irow3D_send - 1
                  !
                  CALL row_col_3D_reflected(irow3D, icol3D, nrows3D, ncols3D, shift3D_recv)
                  shift3D_recv = shift3D_recv - 1
                  proc3D_recv = (icol3D - 1)*nrows3D + irow3D - 1
                  CALL mp_isendrecv(product_matrix_size_send, proc3D_send, &
                                    product_matrix_size_recv, proc3D_recv, &
                                    layers_3D_C_reduction%grp3D, &
                                    requests_reduction_size(1), &
                                    requests_reduction_size(2))
!$OMP END MASTER
               ELSE
                  IF (istep_k .NE. last_step_k) &
                     DBCSR_ABORT("Last layer does not correspond to local layer")
               END IF
               ! Reduce to the local layer
               IF (istep_k .GT. final_step_k + 1) THEN
                  IF (dbcsr_data_get_size_referenced(layers_3D_C_reduction%data_red3D(ithread + 1)) .GT. 0) THEN
                     CALL timeset(routineN//"_red3D", handle2)
                     CALL dbcsr_mm_multrec_red3D(multrec(ithread, myrow3D, mycol3D)%p, &
                                                 buffers_win%right%meta_red3D, &
                                                 layers_3D_C_reduction%data_red3D(ithread + 1), &
                                                 flop, g2l_map_rows, g2l_map_cols)
                     CALL timestop(handle2)
                  END IF
               END IF
!$OMP END PARALLEL
            END IF
         END IF calc
         !
         ! Swap temporary buffers for the first tick
         IF (do_square_layers3D .AND. first_k .AND. &
             istep_k .LT. last_step_k) THEN
            iright_buffer_comm = right_buffers(iright_buffer_comm)%coord3D
            ileft_buffer_comm = left_buffers(ileft_buffer_comm)%coord3D
            CALL swap_buffers(right_buffers(iright_buffer_comm), right_buffers(nbuffers))
            CALL swap_buffers(left_buffers(ileft_buffer_comm), left_buffers(nbuffers))
         END IF
         !
         iright_buffer_calc = iright_buffer_comm
         ileft_buffer_calc = ileft_buffer_comm
      END DO grouped_steps_index
      !
      CALL timestop(handle1)
      !
      CALL m_memory(mem)
      max_memory = MAX(max_memory, REAL(mem))
      !
      IF (do_layers3D .AND. keep_sparsity) THEN
         DEALLOCATE (product_matrix_meta_size, product_matrix_meta_displ)
         DEALLOCATE (product_matrix_meta)
      END IF
      DEALLOCATE (right_norms, left_norms)
      DEALLOCATE (product_matrix_epss_size, product_matrix_epss_displ)
      IF (.NOT. otf_filtering .OR. (do_layers3D .AND. nrows3D .GT. 1)) THEN
         DEALLOCATE (product_matrix_epss)
      ELSE
         DEALLOCATE (row_max_epss)
      END IF
      !
      DEALLOCATE (left_images_size, right_images_size)
      NULLIFY (left_images_size, right_images_size)
      DEALLOCATE (left_images_size_layers3D, left_displ_layers3D)
      DEALLOCATE (right_images_size_layers3D, right_displ_layers3D)
      !
      ! Deallocate 3D layers
      IF (do_layers3D) THEN
         DEALLOCATE (product_matrix_size_send, product_matrix_size_recv)
         DEALLOCATE (requests_reduction)
         DO icol3D = 1, ncols3D
            DO irow3D = 1, nrows3D
               IF (irow3D .NE. myrow3D .OR. icol3D .NE. mycol3D) THEN
                  DEALLOCATE (product_matrix3D%mats(irow3D, icol3D)%matrix)
               END IF
            END DO
         END DO
         CALL dbcsr_data_clear_pointer(data_send)
         CALL dbcsr_data_release(data_send)
      END IF
      DEALLOCATE (product_matrix3D%mats)
      ! Finalize local layer
!$OMP PARALLEL DEFAULT (NONE) &
!$OMP SHARED (multrec, myrow3D, mycol3D) &
!$OMP PRIVATE (ithread)
      ithread = 0
!$    ithread = omp_get_thread_num()
      CALL dbcsr_mm_multrec_finalize(multrec(ithread, myrow3D, mycol3D)%p)
      DEALLOCATE (multrec(ithread, myrow3D, mycol3D)%p)
!$OMP END PARALLEL
      DEALLOCATE (multrec)
      DEALLOCATE (g2l_map_rows, g2l_map_cols)
      CALL dbcsr_finalize(product_matrix, reshuffle=PRESENT(filter_eps) .AND. .NOT. keep_sparsity)
      !
      DO irow3D = 1, nrows3D
         DEALLOCATE (m_sizes(irow3D)%sizes)
      END DO
      DEALLOCATE (m_sizes)
      DO icol3D = 1, ncols3D
         DEALLOCATE (n_sizes(icol3D)%sizes)
      END DO
      DEALLOCATE (n_sizes)
      IF (ASSOCIATED(k_sizes)) DEALLOCATE (k_sizes)
      !
      CALL dbcsr_data_clear_pointer(data_get)
      CALL dbcsr_data_release(data_get)
      !
      ! clean-up of communication buffers
      DO v_ki = 1, nbuffers
         CALL dbcsr_data_clear_pointer(left_buffers(v_ki)%data)
         IF (left_buffers(v_ki)%data%d%memory_type%acc_devalloc) THEN
            CALL acc_event_destroy(left_buffers(v_ki)%data%d%acc_ready)
         END IF
         CALL dbcsr_data_release(left_buffers(v_ki)%data)
         NULLIFY (left_buffers(v_ki)%matrix%index)
         CALL dbcsr_release(left_buffers(v_ki)%matrix)
         !
         CALL dbcsr_data_clear_pointer(right_buffers(v_ki)%data)
         IF (right_buffers(v_ki)%data%d%memory_type%acc_devalloc) THEN
            CALL acc_event_destroy(right_buffers(v_ki)%data%d%acc_ready)
         END IF
         CALL dbcsr_data_release(right_buffers(v_ki)%data)
         NULLIFY (right_buffers(v_ki)%matrix%index)
         CALL dbcsr_release(right_buffers(v_ki)%matrix)
         IF (use_acc()) THEN
            CALL dbcsr_data_clear_pointer(right_buffers(v_ki)%trs_stackbuf)
            IF (right_buffers(v_ki)%trs_stackbuf%d%memory_type%acc_devalloc) THEN
               CALL acc_event_destroy(right_buffers(v_ki)%trs_stackbuf%d%acc_ready)
            END IF
            CALL dbcsr_data_release(right_buffers(v_ki)%trs_stackbuf)
         END IF
      END DO
      DEALLOCATE (left_buffers, right_buffers)
      DEALLOCATE (do_comm_left, do_comm_right)
      DEALLOCATE (right_vcol, left_vrow)
      IF (use_acc()) THEN
         DEALLOCATE (row_blk_sizes2enum, enum2row_blk_sizes)
         DEALLOCATE (col_blk_sizes2enum, enum2col_blk_sizes)
      END IF
      !
      IF (debug_mod) THEN
         v_ki = 0
         DO blk = 1, SIZE(product_matrix%blk_p)
            v_ki = MAX(v_ki, ABS(product_matrix%blk_p(blk)))
         END DO
         WRITE (*, *) routineN//" Actual final size", &
            LOG(REAL(dbcsr_data_get_size(product_matrix%data_area)))/LOG(10.0), &
            LOG(REAL(v_ki))/LOG(10.0)
      END IF
      !
      CALL timestop(handle)
   END SUBROUTINE multiply_3D

   SUBROUTINE win_setup(buffer, do_win_create, request)
      TYPE(dbcsr_buffer), INTENT(INOUT)                  :: buffer
      LOGICAL, DIMENSION(:), INTENT(INOUT)               :: do_win_create
      TYPE(mp_request_type), INTENT(INOUT)               :: request

      CHARACTER(len=*), PARAMETER :: routineN = 'win_setup'
      INTEGER                                            :: handle, handle1, myproc

      CALL timeset(routineN, handle)

      IF (buffer%has_rma_win) THEN
         CALL timeset(routineN//"_win_check", handle1)
         CALL mp_wait(request)
         CALL timestop(handle1)
         IF (do_win_create(1)) THEN
            CALL mp_win_unlock_all(buffer%data_win)
            CALL mp_win_free(buffer%data_win)
         END IF
         IF (do_win_create(2)) THEN
            CALL mp_win_unlock_all(buffer%meta_win)
            CALL mp_win_free(buffer%meta_win)
         END IF
      END IF
      CALL dbcsr_data_release(buffer%data_before_resize)
      IF (ASSOCIATED(buffer%meta_before_resize)) THEN
         CALL memory_deallocate(buffer%meta_before_resize, memtype_mpi_buffer)
         NULLIFY (buffer%meta_before_resize)
      END IF
      !
      CALL mp_environ(taskid=myproc, groupid=buffer%subgrp)
      buffer%myproc = myproc
      IF (do_win_create(1)) THEN
         CALL dbcsr_win_create_any(buffer%data, buffer%subgrp, buffer%data_win)
         CALL mp_win_lock_all(buffer%data_win)
      END IF
      IF (do_win_create(2)) THEN
         CALL mp_win_create(buffer%meta, buffer%subgrp, buffer%meta_win)
         CALL mp_win_lock_all(buffer%meta_win)
      END IF
      !
      buffer%has_rma_win = .TRUE.
      CALL timestop(handle)
   END SUBROUTINE win_setup

   SUBROUTINE row_col_3D_reflected(irow3D, icol3D, nrows3D, ncols3D, shift3D)
      !! Apply reflected order, i.e. row increasing value for odd col value,
      !! row decreasing value for even col value

      INTEGER, INTENT(INOUT)                             :: irow3D, icol3D
      INTEGER, INTENT(IN)                                :: nrows3D, ncols3D, shift3D

      INTEGER                                            :: odd_or_even

      icol3D = MOD(shift3D/nrows3D, ncols3D) + 1
      irow3D = MOD(shift3D, nrows3D)
      odd_or_even = MOD(icol3D, 2)
      irow3D = (nrows3D - irow3D)*(1 - odd_or_even) + (irow3D + 1)*odd_or_even
   END SUBROUTINE row_col_3D_reflected

   SUBROUTINE setup_buffers(buffer_1, buffer_2, buffers, nbuffers, data_size, meta_size, matrix, imgdist)
      TYPE(dbcsr_buffer), INTENT(INOUT), TARGET          :: buffer_1, buffer_2
      TYPE(dbcsr_buffer), ALLOCATABLE, DIMENSION(:), &
         INTENT(INOUT)                                   :: buffers
      INTEGER, INTENT(IN)                                :: nbuffers, data_size, meta_size
      TYPE(dbcsr_type), INTENT(IN)                       :: matrix
      TYPE(dbcsr_imagedistribution_obj), INTENT(INOUT)   :: imgdist

      INTEGER                                            :: ibuffer, jbuffer
      INTEGER, DIMENSION(:), POINTER, CONTIGUOUS         :: meta_p
      LOGICAL                                            :: has_trs_stackbuf
      TYPE(dbcsr_buffer), POINTER                        :: buffer_p

      ALLOCATE (buffers(nbuffers))
      has_trs_stackbuf = dbcsr_data_valid(buffer_1%trs_stackbuf) .OR. dbcsr_data_valid(buffer_2%trs_stackbuf)
      DO ibuffer = 1, nbuffers
         jbuffer = (ibuffer - 1)/2
         IF (MOD(ibuffer, 2) .EQ. 1) THEN
            buffer_p => buffer_1
         ELSE
            buffer_p => buffer_2
         END IF
         ! Use slices for the 3D buffers
         CALL dbcsr_data_init(buffers(ibuffer)%data)
         CALL dbcsr_data_new(buffers(ibuffer)%data, dbcsr_data_get_type(buffer_p%data), &
                             memory_type=dbcsr_data_get_memory_type(buffer_p%data))
         IF (buffers(ibuffer)%data%d%memory_type%acc_devalloc) THEN
            CALL acc_event_create(buffers(ibuffer)%data%d%acc_ready)
         END IF
         CALL dbcsr_data_set_pointer( &
            area=buffers(ibuffer)%data, &
            rsize=data_size, &
            csize=1, &
            pointee=buffer_p%data, &
            source_lb=data_size*jbuffer + 1)
         ! Use meta_p pointer to avoid warning target-lifetime
         meta_p => buffer_p%meta(meta_size*jbuffer + 1: &
                                 meta_size*(jbuffer + 1))
         buffers(ibuffer)%meta => meta_p
         IF (has_trs_stackbuf) THEN
            CALL dbcsr_data_init(buffers(ibuffer)%trs_stackbuf)
            CALL dbcsr_data_new(buffers(ibuffer)%trs_stackbuf, dbcsr_data_get_type(buffer_p%trs_stackbuf), &
                                memory_type=dbcsr_data_get_memory_type(buffer_p%trs_stackbuf))
            IF (buffers(ibuffer)%trs_stackbuf%d%memory_type%acc_devalloc) THEN
               CALL acc_event_create(buffers(ibuffer)%trs_stackbuf%d%acc_ready)
            END IF
            CALL dbcsr_data_set_pointer( &
               area=buffers(ibuffer)%trs_stackbuf, &
               rsize=meta_size/3, &
               csize=1, &
               pointee=buffer_p%trs_stackbuf, &
               source_lb=(meta_size/3)*jbuffer + 1)
         END IF
         CALL setup_buffer_matrix_image(buffers(ibuffer)%matrix, imgdist, matrix, &
                                        buffers(ibuffer)%data, &
                                        buffers(ibuffer)%meta)
      END DO
   END SUBROUTINE setup_buffers

   SUBROUTINE setup_buffer_matrix_image(matrix, imgdist, &
                                        template_matrix, data_buffer, &
                                        meta_buffer)
      TYPE(dbcsr_type), INTENT(INOUT)                    :: matrix
      TYPE(dbcsr_imagedistribution_obj), INTENT(INOUT)   :: imgdist
      TYPE(dbcsr_type), INTENT(IN)                       :: template_matrix
      TYPE(dbcsr_data_obj), INTENT(INOUT)                :: data_buffer
      INTEGER, DIMENSION(:), INTENT(IN), TARGET, CONTIGUOUS :: meta_buffer

      matrix = dbcsr_type()
      CALL dbcsr_create(matrix, &
                        "Buffer image of "//template_matrix%name, &
                        imgdist%i%main, &
                        dbcsr_type_no_symmetry, &
                        row_blk_size_obj=template_matrix%row_blk_size, &
                        col_blk_size_obj=template_matrix%col_blk_size, &
                        data_type=dbcsr_data_get_type(data_buffer), &
                        data_buffer=data_buffer, &
                        max_rbs=template_matrix%max_rbs, max_cbs=template_matrix%max_cbs, &
                        row_blk_offset=template_matrix%row_blk_offset, &
                        col_blk_offset=template_matrix%col_blk_offset, &
                        index_memory_type=memtype_mpi_buffer, &
                        make_index=.FALSE.)
      matrix%index => meta_buffer
      matrix%negate_real = template_matrix%negate_real
      matrix%negate_imaginary = template_matrix%negate_imaginary
      matrix%local_indexing = .TRUE.
      matrix%list_indexing = .TRUE.
   END SUBROUTINE setup_buffer_matrix_image

   SUBROUTINE swap_buffers(buffers_1, buffers_2)
      TYPE(dbcsr_buffer), INTENT(INOUT)                  :: buffers_1, buffers_2

      TYPE(dbcsr_buffer)                                 :: tmp

      tmp = buffers_1
      buffers_1 = buffers_2
      buffers_2 = tmp
   END SUBROUTINE swap_buffers

   SUBROUTINE rma_transfer(recv_vproc, nimages, &
                           size_layers3D, displ_layers3D, &
                           buffer, &
                           meta_win, data_win, &
                           data_get, data_type_byte, &
                           buffer_win, layer3D, nlayers3D)
      INTEGER, INTENT(IN)                                :: recv_vproc, nimages
      INTEGER, DIMENSION(:), INTENT(IN)                  :: size_layers3D, displ_layers3D
      TYPE(dbcsr_buffer), INTENT(INOUT)                  :: buffer
      TYPE(mp_win_type), INTENT(IN)                      :: meta_win, data_win
      TYPE(dbcsr_data_obj), INTENT(INOUT)                :: data_get
      INTEGER, INTENT(IN)                                :: data_type_byte
      TYPE(dbcsr_buffer), INTENT(IN)                     :: buffer_win
      INTEGER, INTENT(IN)                                :: layer3D, nlayers3D

      INTEGER                                            :: recv_proc
      INTEGER, DIMENSION(:), POINTER, CONTIGUOUS         :: meta_get

      buffer%is_comm = .TRUE.
      buffer%get_requests(:) = mp_request_null
      recv_proc = (recv_vproc/nimages)*nlayers3D + layer3D - 1
      !
      meta_get => buffer%meta(dbcsr_num_slots + 1:dbcsr_num_slots + size_layers3D(imeta))
      buffer%meta_size = size_layers3D(imeta)
      CALL mp_rget(meta_get, recv_proc, &
                   meta_win, &
                   buffer_win%meta, &
                   buffer_win%myproc, &
                   disp=displ_layers3D(imeta), &
                   request=buffer%get_requests(1))
      CALL dbcsr_data_set_pointer( &
         area=data_get, &
         rsize=size_layers3D(idata), &
         csize=1, &
         pointee=buffer%data, &
         source_lb=1)
      CALL dbcsr_rget_any(data_get, recv_proc, &
                          data_win, &
                          buffer_win%data, &
                          buffer_win%myproc, &
                          disp=displ_layers3D(idata), &
                          request=buffer%get_requests(2))
      CALL count_mpi_statistics(dbcsr_mpi_statistics%data_size(1, :), &
                                size_layers3D(idata), &
                                data_type_byte, &
                                dbcsr_mpi_statistics%data_size_breakdown(:, :, 1))
      dbcsr_mpi_statistics%nexchanged = dbcsr_mpi_statistics%nexchanged + 1
      !
      ! Set the referenced sizes to the actual data moved via MPI
      CALL dbcsr_data_set_size_referenced(buffer%data, size_layers3D(idata))
      buffer%matrix%valid = .FALSE.
   END SUBROUTINE rma_transfer

   SUBROUTINE setup_rec_index_images(meta_buffer, img_nblks_rows, img_nblks_cols, &
                                     refs_size, refs_displ, size_index, has_threads)
      INTEGER, DIMENSION(:), INTENT(INOUT)               :: meta_buffer
      INTEGER, DIMENSION(:), INTENT(IN)                  :: img_nblks_rows, img_nblks_cols, &
                                                            refs_size, refs_displ
      INTEGER, INTENT(IN)                                :: size_index
      LOGICAL, INTENT(IN)                                :: has_threads

      CHARACTER(len=*), PARAMETER :: routineN = 'setup_rec_index_images'

      INTEGER                                            :: handle, in, nblkcols_local, &
                                                            nblkrows_local, t_f, t_l, t_size

!$    INTEGER                           :: ithread

      CALL timeset(routineN, handle)
      IF (has_threads) THEN
         nblkrows_local = img_nblks_rows(1)
      ELSE
         nblkcols_local = img_nblks_cols(1)
      END IF
      !
      DO in = 1, SIZE(refs_size)
         IF (refs_size(in) .EQ. 0) CYCLE
         ! Number of blocks
         t_size = (refs_size(in) - size_index)/3
         IF (has_threads) THEN
            nblkcols_local = img_nblks_cols(in)
         ELSE
            nblkrows_local = img_nblks_rows(in)
         END IF
         t_f = 1
         t_l = t_size
!$OMP    PARALLEL IF (has_threads) DEFAULT (NONE) &
!$OMP    PRIVATE (ithread) &
!$OMP    FIRSTPRIVATE (t_f, t_l, t_size) &
!$OMP    SHARED (meta_buffer, in, has_threads, refs_displ, &
!$OMP            size_index, nblkrows_local, nblkcols_local)
!$       ithread = OMP_GET_THREAD_NUM() + 1
!$       IF (has_threads) THEN
!$          t_f = meta_buffer(refs_displ(in) + ithread) + 1
!$          t_l = meta_buffer(refs_displ(in) + ithread + 1)
!$       END IF
         t_size = t_l - t_f + 1
         IF (t_size .GT. 0) THEN
            CALL rec_sort_index(1, nblkrows_local, &
                                1, nblkcols_local, &
                                t_size, &
                                meta_buffer(refs_displ(in) + size_index + t_f*3 - 2: &
                                            refs_displ(in) + size_index + t_l*3), &
                                0)
         END IF
!$OMP    END PARALLEL
      END DO
      CALL timestop(handle)
   END SUBROUTINE setup_rec_index_images

   SUBROUTINE buffer_init(buffer, data_type, &
      !! Init buffer
                          data_size, meta_size, &
                          num_data, &
                          data_memory_type, trs_memory_type)
      TYPE(dbcsr_buffer), INTENT(INOUT)                  :: buffer
      INTEGER, INTENT(IN)                                :: data_type, data_size, meta_size
      INTEGER, INTENT(IN), OPTIONAL                      :: num_data
      TYPE(dbcsr_memtype_type), INTENT(IN)               :: data_memory_type
      TYPE(dbcsr_memtype_type), INTENT(IN), OPTIONAL     :: trs_memory_type

      INTEGER                                            :: my_num_data
      LOGICAL                                            :: new_trs_stackbuf

      my_num_data = 1
      IF (PRESENT(num_data)) THEN
         my_num_data = num_data
      ELSE
         IF (dbcsr_data_valid(buffer%data_before_resize) .OR. ASSOCIATED(buffer%meta_before_resize)) &
            DBCSR_ABORT("Previous data area already initialized.")
         CALL dbcsr_data_init(buffer%data_before_resize)
         CALL dbcsr_data_new(buffer%data_before_resize, data_type, memory_type=data_memory_type)
      END IF
      new_trs_stackbuf = PRESENT(trs_memory_type) .AND. use_acc()
      !
      IF (buffer%is_valid) THEN
         ! Invalid buffers if data_type is different
         IF (dbcsr_data_get_type(buffer%data) .NE. data_type) THEN
            CALL dbcsr_data_release(buffer%data)
            IF (new_trs_stackbuf) THEN
               CALL dbcsr_data_release(buffer%trs_stackbuf)
            END IF
            buffer%is_valid = .FALSE.
         END IF
      END IF
      !
      IF (.NOT. buffer%is_valid) THEN
         ! First initialization
         CALL dbcsr_data_init(buffer%data)
         CALL dbcsr_data_new(buffer%data, data_type=data_type, &
                             data_size=data_size*my_num_data, memory_type=data_memory_type)
         CALL dbcsr_data_set_size_referenced(buffer%data, data_size*my_num_data)
         IF (new_trs_stackbuf) THEN
            CALL dbcsr_data_init(buffer%trs_stackbuf)
            CALL dbcsr_data_new(buffer%trs_stackbuf, &
                                data_type=dbcsr_type_int_4, data_size=(meta_size/3)*my_num_data, &
                                memory_type=trs_memory_type)
         END IF
         buffer%is_valid = .TRUE.
      ELSE
         IF (PRESENT(num_data)) THEN
            CALL dbcsr_data_ensure_size(buffer%data, data_size*my_num_data, nocopy=.TRUE.)
            IF (new_trs_stackbuf) THEN
               CALL dbcsr_data_ensure_size(buffer%trs_stackbuf, (meta_size/3)*my_num_data, nocopy=.TRUE.)
            END IF
         ELSE
            ! Case for MPI windows
            ! data_before_resize keeps the pointer to previous data in the case of reallocation
            CALL dbcsr_data_ensure_size(buffer%data, data_size, nocopy=.TRUE., &
                                        area_resize=buffer%data_before_resize)
         END IF
      END IF
      !
      IF (PRESENT(num_data)) THEN
         CALL ensure_array_size(buffer%meta, ub=meta_size*my_num_data, nocopy=.TRUE., &
                                memory_type=memtype_mpi_buffer)
      ELSE
         ! Case for MPI windows
         ! meta_before_resize keeps the pointer to previous meta in the case of reallocation
         CALL ensure_array_size(buffer%meta, array_resize=buffer%meta_before_resize, &
                                ub=meta_size, nocopy=.TRUE., &
                                memory_type=memtype_mpi_buffer)
      END IF
      !
      buffer%is_comm = .FALSE.
   END SUBROUTINE buffer_init

   SUBROUTINE buffers_release()
      !! Release all buffers
      IF (request_sync_mult .NE. mp_request_null) CALL mp_wait(request_sync_mult)
      request_sync_mult = mp_request_null
      CALL buffer_release(buffers_1%right)
      CALL buffer_release(buffers_1%left)
      CALL buffer_release(buffers_2%right)
      CALL buffer_release(buffers_2%left)
      CALL buffer_release(buffers_win%right)
      CALL buffer_release(buffers_win%left)
      !
      IF (dbcsr_data_valid(make_buffers_data_send)) CALL dbcsr_data_release(make_buffers_data_send)
      IF (dbcsr_data_valid(make_buffers_data_recv)) CALL dbcsr_data_release(make_buffers_data_recv)
      IF (ASSOCIATED(make_buffers_meta_send)) CALL memory_deallocate(make_buffers_meta_send, memtype_mpi_buffer)
      IF (ASSOCIATED(make_buffers_meta_recv)) CALL memory_deallocate(make_buffers_meta_recv, memtype_mpi_buffer)
   END SUBROUTINE buffers_release

   SUBROUTINE buffer_release(buffer)
      !! Release buffer
      TYPE(dbcsr_buffer), INTENT(INOUT)                  :: buffer

      IF (buffer%has_rma_win) THEN
         CALL mp_win_unlock_all(buffer%data_win)
         CALL mp_win_free(buffer%data_win)
         CALL mp_win_unlock_all(buffer%meta_win)
         CALL mp_win_free(buffer%meta_win)
         buffer%has_rma_win = .FALSE.
         buffer%grp = mp_comm_null
         IF (buffer%subgrp .NE. mp_comm_null .AND. buffer%num_layers_3D .GT. 1) &
            CALL mp_comm_free(buffer%subgrp)
         buffer%subgrp = mp_comm_null
         buffer%num_layers_3D = 1
      END IF
      !
      IF (buffer%is_valid) THEN
         CALL dbcsr_data_release(buffer%data)
         IF (dbcsr_data_valid(buffer%trs_stackbuf)) THEN
            CALL dbcsr_data_release(buffer%trs_stackbuf)
         END IF
         IF (dbcsr_data_valid(buffer%data_before_resize)) THEN
            CALL dbcsr_data_release(buffer%data_before_resize)
         END IF
         buffer%is_valid = .FALSE.
      END IF
      IF (ASSOCIATED(buffer%meta)) THEN
         CALL memory_deallocate(buffer%meta, memtype_mpi_buffer)
         NULLIFY (buffer%meta)
      END IF
      IF (ASSOCIATED(buffer%meta_before_resize)) THEN
         CALL memory_deallocate(buffer%meta_before_resize, memtype_mpi_buffer)
         NULLIFY (buffer%meta_before_resize)
      END IF
      IF (ASSOCIATED(buffer%meta_red3D)) THEN
         CALL memory_deallocate(buffer%meta_red3D, memtype_mpi_buffer)
         NULLIFY (buffer%meta_red3D)
      END IF
   END SUBROUTINE buffer_release

   SUBROUTINE make_meta(buffer, ntotal_images, &
      !! Create meta indices
                        vprow, vpcol, &
                        imgdist, do_merge_rows, &
                        global_indices, &
                        nthreads)
      TYPE(dbcsr_buffer), INTENT(INOUT)                  :: buffer
      INTEGER, INTENT(IN)                                :: ntotal_images, vprow, vpcol
      TYPE(dbcsr_imagedistribution_obj), INTENT(INOUT)   :: imgdist
      LOGICAL, INTENT(IN)                                :: do_merge_rows
      INTEGER, DIMENSION(:), INTENT(IN)                  :: global_indices
      INTEGER, INTENT(IN), OPTIONAL                      :: nthreads

      INTEGER                                            :: size_index

      buffer%matrix%index(dbcsr_slot_size) = &
         buffer%meta_size + dbcsr_num_slots
      size_index = 0
      IF (PRESENT(nthreads)) THEN
!$       size_index = nthreads + 1
      END IF
      buffer%matrix%index(dbcsr_slot_nblks) = &
         (buffer%meta_size - size_index)/3
      buffer%matrix%index(dbcsr_slot_nze) = &
         dbcsr_data_get_size_referenced(buffer%data)
      buffer%matrix%index(dbcsr_slot_dense) = 0
      buffer%matrix%index(dbcsr_slot_nblkrows_total:dbcsr_slot_nfullcols_local) = &
         global_indices(:)
      buffer%matrix%index(dbcsr_slot_type:dbcsr_num_slots) = 0
      ! Virtual coords
      IF (do_merge_rows) THEN
         buffer%matrix%index(dbcsr_slot_home_vprow) = vprow
         buffer%matrix%index(dbcsr_slot_home_vpcol) = MOD(vpcol, ntotal_images)
      ELSE
         buffer%matrix%index(dbcsr_slot_home_vprow) = MOD(vprow, ntotal_images)
         buffer%matrix%index(dbcsr_slot_home_vpcol) = vpcol
      END IF
      buffer%matrix%index(dbcsr_slot_row_p) = 1
      buffer%matrix%index(dbcsr_slot_col_i) = 1
      buffer%matrix%index(dbcsr_slot_blk_p) = 1
      ! thr_c
      size_index = dbcsr_num_slots
!$    IF (PRESENT(nthreads)) THEN
!$       size_index = size_index + nthreads + 1
!$       buffer%matrix%index(dbcsr_slot_thr_c) = dbcsr_num_slots + 1
!$       buffer%matrix%index(dbcsr_slot_thr_c + 1) = size_index
!$    END IF
      buffer%matrix%index(dbcsr_slot_coo_l) = size_index + 1
      buffer%matrix%index(dbcsr_num_slots) = buffer%matrix%index(dbcsr_slot_size)
      !
      ! Reset
      CALL dbcsr_reset_vlocals(buffer%matrix, imgdist)
      !
      ! Repoint index
      buffer%matrix%nblks = 0
      buffer%matrix%nze = 0
      CALL dbcsr_repoint_index(buffer%matrix)
      buffer%matrix%valid = .TRUE.
   END SUBROUTINE make_meta

   SUBROUTINE remap_layers3D(refs_size, refs_size_layers3D, refs_displ_layers3D, &
      !! Remap the 4-rank array in a 3-rank array by introducing the virtual coordinate
                             data_size, meta_size)
      INTEGER, DIMENSION(:, :, :, :), INTENT(IN), &
         CONTIGUOUS, POINTER                             :: refs_size
      INTEGER, DIMENSION(:, :, :), INTENT(OUT), &
         CONTIGUOUS, POINTER                             :: refs_size_layers3D, refs_displ_layers3D
      INTEGER, INTENT(OUT)                               :: data_size, meta_size

      INTEGER                                            :: ilayer, image, iproc, nimages, &
                                                            nlayers3D, nprocs

      nimages = SIZE(refs_size, 2)
      nlayers3D = SIZE(refs_size, 3)
      nprocs = SIZE(refs_size, 4)
      !
      ALLOCATE (refs_size_layers3D(idata:imeta, nlayers3D, 0:nimages*nprocs - 1))
      ALLOCATE (refs_displ_layers3D(idata:imeta, nlayers3D, 0:nimages*nprocs - 1))
      data_size = 0; meta_size = 0
      !
!$OMP PARALLEL DO DEFAULT (NONE) &
!$OMP SHARED (nprocs, nimages, nlayers3D, &
!$OMP         refs_size_layers3D, refs_displ_layers3D, refs_size) &
!$OMP PRIVATE (iproc,image,ilayer) &
!$OMP REDUCTION (MAX : data_size, meta_size)
      DO iproc = 0, nprocs - 1
         DO ilayer = 1, nlayers3D
            DO image = 1, nimages
               refs_size_layers3D(:, ilayer, image + iproc*nimages - 1) = refs_size(:, image, ilayer, iproc)
               data_size = MAX(data_size, refs_size(idata, image, ilayer, iproc))
               meta_size = MAX(meta_size, refs_size(imeta, image, ilayer, iproc))
            END DO
            refs_displ_layers3D(:, ilayer, iproc*nimages) = 0
            DO image = 1, nimages - 1
               refs_displ_layers3D(:, ilayer, image + iproc*nimages) = &
                  refs_displ_layers3D(:, ilayer, image + iproc*nimages - 1) + refs_size(:, image, ilayer, iproc)
            END DO
         END DO
      END DO
!$OMP END PARALLEL DO
   END SUBROUTINE remap_layers3D

   PURE FUNCTION get_max_layers_3D()
      INTEGER                                            :: get_max_layers_3D

      get_max_layers_3D = layers_3D_C_reduction%max_num_layers_3D

   END FUNCTION get_max_layers_3D

END MODULE dbcsr_mm_3d