make_buffers Subroutine

private subroutine make_buffers(matrix, imgdist, is_left, f_row, l_row, f_col, l_col, otf_filtering, scale_value)

Prepare orig images for MPI windows

Arguments

Type IntentOptional Attributes Name
type(dbcsr_type), intent(in) :: matrix
type(dbcsr_imagedistribution_obj), intent(in) :: imgdist
logical, intent(in) :: is_left
integer, intent(in) :: f_row
integer, intent(in) :: l_row
integer, intent(in) :: f_col
integer, intent(in) :: l_col
logical, intent(in) :: otf_filtering
type(dbcsr_scalar_type), intent(in), optional :: scale_value

Source Code

   SUBROUTINE make_buffers(matrix, imgdist, is_left, &
      !! Prepare orig images for MPI windows
                           f_row, l_row, f_col, l_col, &
                           otf_filtering, scale_value)
      TYPE(dbcsr_type), INTENT(IN)                       :: matrix
      TYPE(dbcsr_imagedistribution_obj), INTENT(IN)      :: imgdist
      LOGICAL, INTENT(IN)                                :: is_left
      INTEGER, INTENT(IN)                                :: f_row, l_row, f_col, l_col
      LOGICAL, INTENT(IN)                                :: otf_filtering
      TYPE(dbcsr_scalar_type), INTENT(IN), OPTIONAL      :: scale_value

      CHARACTER(len=*), PARAMETER :: routineN = 'make_buffers'
      INTEGER :: blk, blk_p, bp, col, col_img, col_size, data_type, dst_proc, f_col_f, f_row_f, &
                 handle, handle2, irequests, it, ithread, l_col_l, l_row_l, mynode, myt, &
                 nblkcols_local, nblkrows_local, ncols_images, nimages, nprocs, nprocs_total, &
                 nrows_images, nsymmetries, nthreads, nze, pcol, prow, row, row_img, row_size, size_index, &
                 stored_col, stored_row, symmetry_i, tr_col_size, tr_row_size
      INTEGER, ALLOCATABLE, DIMENSION(:)                 :: img_nblks_cols, img_nblks_rows
      INTEGER, ALLOCATABLE, DIMENSION(:, :)              :: local_images_displ, recv_displ_proc, &
                                                            recv_size_proc, send_displ_proc, &
                                                            send_size_proc
      INTEGER, ALLOCATABLE, DIMENSION(:, :, :)           :: offset_data, offset_threads
      INTEGER, ALLOCATABLE, DIMENSION(:, :, :, :)        :: recv_displs, recv_sizes, send_displs, &
                                                            send_sizes
      INTEGER, DIMENSION(2)                              :: block_col_bounds, block_row_bounds
      INTEGER, DIMENSION(:), POINTER, CONTIGUOUS  :: col_dist, col_img_dist, local_cols, local_g2l_map_cols, &
                                                     local_g2l_map_rows, local_rows, meta_buffer_p, &
                                                     row_dist, row_img_dist, threads_dist
      INTEGER, DIMENSION(:, :), POINTER, CONTIGUOUS      :: blacs2mpi, local_images_size
      INTEGER, DIMENSION(idata:imeta)                    :: my_size_recv, my_size_send
      INTEGER, POINTER                                   :: coli, rowi
      INTEGER, TARGET                                    :: mi, ui
      LOGICAL :: do_crop, do_part_crop_col, do_part_crop_f_col, do_part_crop_f_row, &
                 do_part_crop_l_col, do_part_crop_l_row, do_part_crop_row, do_symmetry, tr
      LOGICAL, DIMENSION(:), POINTER, CONTIGUOUS         :: do_win_create
      TYPE(dbcsr_buffer), POINTER                        :: buffer
      TYPE(dbcsr_data_obj)                               :: data_block
      TYPE(dbcsr_data_obj), POINTER                      :: data_buffer_p
      TYPE(dbcsr_distribution_obj)                       :: set_dist
      TYPE(dbcsr_iterator)                               :: iter
      TYPE(dbcsr_mp_obj)                                 :: mp_obj
      TYPE(dbcsr_scalar_type)                            :: scale_neg_one
      TYPE(dbcsr_type)                                   :: sm
      TYPE(mp_comm_type)                                 :: grp

!$    INTEGER(kind=omp_lock_kind), ALLOCATABLE, DIMENSION(:) :: locks

      CALL timeset(routineN, handle)
      ! Sync with previous multiplication
      IF (request_sync_mult .NE. mp_request_null) &
         DBCSR_ABORT("Multiplications are not in sync!")
      !
      ! Take input values and check validity
      IF (.NOT. dbcsr_valid_index(matrix)) &
         DBCSR_ABORT("Matrix not initialized.")
      sm = matrix
      data_type = sm%data_type
      IF (data_type .NE. dbcsr_type_real_8 .AND. &
          data_type .NE. dbcsr_type_real_4 .AND. &
          data_type .NE. dbcsr_type_complex_8 .AND. &
          data_type .NE. dbcsr_type_complex_4) &
         DBCSR_ABORT("Invalid data type.")
      scale_neg_one = dbcsr_scalar_negative(dbcsr_scalar_one(data_type))
      set_dist = imgdist%i%main
      row_dist => dbcsr_distribution_row_dist(set_dist)
      col_dist => dbcsr_distribution_col_dist(set_dist)
      local_rows => dbcsr_distribution_local_rows(set_dist)
      local_cols => dbcsr_distribution_local_cols(set_dist)
      nblkrows_local = SIZE(local_rows)
      nblkcols_local = SIZE(local_cols)
      IF (sm%symmetry) THEN
         IF (SIZE(row_dist) .NE. SIZE(col_dist)) &
            DBCSR_WARN('Unequal row and column distributions for symmetric matrix.')
      END IF
      nrows_images = imgdist%i%row_decimation
      row_img_dist => array_data(imgdist%i%row_image)
      ncols_images = imgdist%i%col_decimation
      col_img_dist => array_data(imgdist%i%col_image)
      mp_obj = dbcsr_distribution_mp(imgdist%i%main)
      CALL dbcsr_mp_grid_setup(mp_obj)
      nprocs_total = dbcsr_mp_numnodes(mp_obj)
      mynode = dbcsr_mp_mynode(mp_obj)
      grp = dbcsr_mp_group(mp_obj)
      blacs2mpi => dbcsr_mp_pgrid(mp_obj)
      IF (dbcsr_distribution_max_row_dist(set_dist) .GT. UBOUND(blacs2mpi, 1)) &
         DBCSR_ABORT("Row distribution references unexistent processor rows")
      IF (dbcsr_distribution_max_col_dist(set_dist) .GT. UBOUND(blacs2mpi, 2)) &
         DBCSR_ABORT("Col distribution references unexistent processor cols")
      ! Check threads configuration
      NULLIFY (threads_dist)
!$    IF (.NOT. dbcsr_distribution_has_threads(dbcsr_distribution(matrix))) &
!$       DBCSR_ABORT("Thread distribution not defined")
!$    threads_dist => array_data(dbcsr_distribution_thread_dist(dbcsr_distribution(matrix)))
      IF (is_left) THEN
         IF (nrows_images .GT. 1) &
            DBCSR_ABORT("Row nimages for left matrix is not 1!")
      ELSE
         IF (ncols_images .GT. 1) &
            DBCSR_ABORT("Col nimages for right matrix is not 1!")
      END IF
      !
      ! Crop matrix
      do_crop = .FALSE.
      do_part_crop_row = .FALSE.
      do_part_crop_col = .FALSE.
      do_part_crop_f_row = .FALSE.
      do_part_crop_l_row = .FALSE.
      do_part_crop_f_col = .FALSE.
      do_part_crop_l_col = .FALSE.
      ! Set no limits
      IF (ANY((/f_row, l_row, f_col, l_col/) .NE. 0)) THEN
         IF (f_row .LT. 0) &
            DBCSR_ABORT("Invalid first row bound.")
         IF (l_row .GT. dbcsr_nfullrows_total(matrix)) &
            DBCSR_ABORT("Invalid last row bound.")
         IF (f_col .LT. 0) &
            DBCSR_ABORT("Invalid first column bound.")
         IF (l_col .GT. dbcsr_nfullcols_total(matrix)) &
            DBCSR_ABORT("Invalid last column bound.")
         !
         do_crop = .TRUE.
         !
         ! Convert bounds to block addressing
         IF (f_row .EQ. 0) THEN
            block_row_bounds(1) = 1
         ELSE
            CALL find_block_of_element(f_row, block_row_bounds(1), &
                                       dbcsr_nblkrows_total(matrix), &
                                       dbcsr_row_block_offsets(matrix), &
                                       hint=0)
            do_part_crop_f_row = array_get(dbcsr_row_block_offsets(matrix), block_row_bounds(1)) .NE. f_row
            IF (do_part_crop_f_row) THEN
               ! Block offset of last cleared row
               f_row_f = f_row - array_get(dbcsr_row_block_offsets(matrix), block_row_bounds(1))
            END IF
         END IF
         !
         IF (l_row .EQ. 0) THEN
            block_row_bounds(2) = dbcsr_nblkrows_total(matrix)
         ELSE
            CALL find_block_of_element(l_row, block_row_bounds(2), &
                                       dbcsr_nblkrows_total(matrix), &
                                       dbcsr_row_block_offsets(matrix), &
                                       hint=0)
            do_part_crop_l_row = (array_get(dbcsr_row_block_offsets(matrix), block_row_bounds(2) + 1) - 1) .NE. l_row
            IF (do_part_crop_l_row) THEN
               ! Block offset of first cleared row
               l_row_l = 2 + l_row - array_get(dbcsr_row_block_offsets(matrix), block_row_bounds(2))
            END IF
         END IF
         do_part_crop_row = do_part_crop_f_row .OR. do_part_crop_l_row
         !
         IF (f_col .EQ. 0) THEN
            block_col_bounds(1) = 1
         ELSE
            CALL find_block_of_element(f_col, block_col_bounds(1), &
                                       dbcsr_nblkcols_total(matrix), &
                                       dbcsr_col_block_offsets(matrix), &
                                       hint=0)
            do_part_crop_f_col = array_get(dbcsr_col_block_offsets(matrix), block_col_bounds(1)) .NE. f_col
            IF (do_part_crop_f_col) THEN
               ! Block offset of last cleared col
               f_col_f = f_col - array_get(dbcsr_col_block_offsets(matrix), block_col_bounds(1))
            END IF
         END IF
         !
         IF (l_col .EQ. 0) THEN
            block_col_bounds(2) = dbcsr_nblkcols_total(matrix)
         ELSE
            CALL find_block_of_element(l_col, block_col_bounds(2), &
                                       dbcsr_nblkcols_total(matrix), &
                                       dbcsr_col_block_offsets(matrix), &
                                       hint=0)
            do_part_crop_l_col = (array_get(dbcsr_col_block_offsets(matrix), block_col_bounds(2) + 1) - 1) .NE. l_col
            IF (do_part_crop_l_col) THEN
               ! Block offset of first cleared col
               l_col_l = 2 + l_col - array_get(dbcsr_col_block_offsets(matrix), block_col_bounds(2))
            END IF
         END IF
         do_part_crop_col = do_part_crop_f_col .OR. do_part_crop_l_col
      END IF
      !
      IF (dbcsr_has_symmetry(matrix)) THEN
         nsymmetries = 2
         do_symmetry = .TRUE.
      ELSE
         nsymmetries = 1
         do_symmetry = .FALSE.
      END IF
      !
      IF (is_left) THEN
         nimages = ncols_images
         buffer => buffers_win%left
         nprocs = dbcsr_mp_npcols(mp_obj)
         ALLOCATE (left_images_size(idata:imeta, &
                                    nimages, &
                                    MAX(1, dbcsr_mp_nprows(mp_obj)/layers_3D_C_reduction%side3D), &
                                    0:nprocs - 1))
         ALLOCATE (left_local_images_size(idata:imeta, nimages))
         local_images_size => left_local_images_size
         irequests = 1
         !
         ! Count the maximum possible multiplies per row for on-the-fly filtering
         IF (otf_filtering) THEN
            ALLOCATE (left_total_row_counts(nblkrows_local))
            left_total_row_counts = 0
         END IF
         do_win_create => do_win_create_left
      ELSE
         nimages = nrows_images
         buffer => buffers_win%right
         nprocs = dbcsr_mp_nprows(mp_obj)
         ALLOCATE (right_images_size(idata:imeta, &
                                     nimages, &
                                     MAX(1, dbcsr_mp_npcols(mp_obj)/layers_3D_C_reduction%side3D), &
                                     0:nprocs - 1))
         ALLOCATE (right_local_images_size(idata:imeta, nimages))
         local_images_size => right_local_images_size
         irequests = 2
         do_win_create => do_win_create_right
      END IF
      !
      ! 3D communicator
      CALL make_layers_3D_AB(layers_3D_C_reduction%num_layers_3D, &
                             layers_3D_C_reduction%side3D, &
                             mp_obj, is_left, buffer)
      !
      ! Evaluate maps for global -> local indexing (g2l_map_rows, g2l_map_cols)
      ! Count the number of blocks per row/column (img_nblks_rows, img_nblks_cols)
      IF (is_left) THEN
         ALLOCATE (g2l_map_rows(sm%nblkrows_total))
         local_g2l_map_rows => g2l_map_rows
         ALLOCATE (local_g2l_map_cols(sm%nblkcols_total))
         ALLOCATE (img_nblks_rows(1), img_nblks_cols(nimages))
      ELSE
         ALLOCATE (g2l_map_cols(sm%nblkcols_total))
         local_g2l_map_cols => g2l_map_cols
         ALLOCATE (local_g2l_map_rows(sm%nblkrows_total))
         ALLOCATE (img_nblks_rows(nimages), img_nblks_cols(1))
      END IF
      !
      local_g2l_map_rows(:) = 0
      IF (nrows_images .EQ. 1) THEN
         img_nblks_rows(1) = nblkrows_local
         DO row = 1, nblkrows_local
            local_g2l_map_rows(local_rows(row)) = row
         END DO
      ELSE
         img_nblks_rows(:) = 0
         DO row = 1, nblkrows_local
            row_img = row_img_dist(local_rows(row))
            ui = MOD(row_img - 1, nrows_images) + 1
            img_nblks_rows(ui) = img_nblks_rows(ui) + 1
            local_g2l_map_rows(local_rows(row)) = img_nblks_rows(ui)
         END DO
      END IF
      !
      local_g2l_map_cols(:) = 0
      IF (ncols_images .EQ. 1) THEN
         img_nblks_cols(1) = nblkcols_local
         DO col = 1, nblkcols_local
            local_g2l_map_cols(local_cols(col)) = col
         END DO
      ELSE
         img_nblks_cols(:) = 0
         DO col = 1, nblkcols_local
            col_img = col_img_dist(local_cols(col))
            ui = MOD(col_img - 1, ncols_images) + 1
            img_nblks_cols(ui) = img_nblks_cols(ui) + 1
            local_g2l_map_cols(local_cols(col)) = img_nblks_cols(ui)
         END DO
      END IF
      !
!$OMP PARALLEL DEFAULT (NONE) &
!$OMP PRIVATE (ithread,myt,iter,row,col,blk,row_size,col_size,&
!$OMP          stored_row,stored_col,blk_p,bp,tr,&
!$OMP          nze,symmetry_i,row_img,col_img,rowi,coli,&
!$OMP          tr_row_size,tr_col_size,prow,pcol,dst_proc,&
!$OMP          data_buffer_p,meta_buffer_p,&
!$OMP          mi,ui,it,data_block) &
!$OMP SHARED (nthreads,send_sizes,offset_data,matrix,nsymmetries,do_symmetry,&
!$OMP         row_img_dist,col_img_dist,imgdist,row_dist,col_dist,&
!$OMP         is_left,my_size_send,my_size_recv,nimages,&
!$OMP         local_images_size,data_type,memtype_mpi_buffer,sm,&
!$OMP         img_nblks_cols,img_nblks_rows,mynode,offset_threads,&
!$OMP         local_g2l_map_cols,local_g2l_map_rows,recv_sizes,grp,make_buffers_meta_send,&
!$OMP         scale_value,scale_neg_one,make_buffers_data_send,make_buffers_data_recv,&
!$OMP         size_index,recv_displ_proc,recv_size_proc,send_size_proc,&
!$OMP         mp_obj,threads_dist,make_buffers_meta_recv,nrows_images,ncols_images,&
!$OMP         locks,blacs2mpi,send_displ_proc,recv_displs,send_displs,&
!$OMP         left_images_size,right_images_size,local_images_displ,requests,&
!$OMP         buffer,left_total_row_counts,otf_filtering,&
!$OMP         irequests,do_win_create,handle2,nprocs_total,&
!$OMP         do_crop,do_part_crop_row,do_part_crop_col,block_row_bounds,block_col_bounds,&
!$OMP         do_part_crop_f_row,do_part_crop_l_row,do_part_crop_f_col,do_part_crop_l_col,&
!$OMP         f_row_f,l_row_l,f_col_f,l_col_l,requests_win_create)
      ithread = 0
!$    ithread = omp_get_thread_num()
      myt = ithread
      IF (is_left) THEN
         rowi => mi
         coli => ui
      ELSE
         rowi => ui
         coli => mi
      END IF
!$OMP MASTER
      nthreads = 1
!$    nthreads = omp_get_num_threads()
      ALLOCATE (send_sizes(idata:imeta, 0:nthreads - 1, &
                           nimages, 0:nprocs_total - 1))
      send_sizes(:, :, :, :) = 0
      !
      size_index = 0
!$    IF (is_left) THEN
!$       size_index = nthreads + 1
!$    END IF
!$    IF (is_left .AND. do_symmetry) THEN
!$       ALLOCATE (locks(0:nthreads - 1))
!$    END IF
!$OMP END MASTER
!$OMP BARRIER
!$    IF (is_left .AND. do_symmetry) THEN
!$       call omp_init_lock(locks(ithread))
!$    END IF
      !
      ! Take data and meta dimensions per each thread, image, proc
      CALL dbcsr_iterator_start(iter, matrix, shared=.TRUE.)
      DO WHILE (dbcsr_iterator_blocks_left(iter))
         CALL dbcsr_iterator_next_block(iter, row, col, blk, &
                                        row_size=row_size, col_size=col_size)
         nze = row_size*col_size
         IF (nze .EQ. 0) CYCLE
         DO symmetry_i = 1, nsymmetries
            IF (symmetry_i .EQ. 1) THEN
               stored_row = row; stored_col = col
            ELSE
               IF (row .EQ. col) CYCLE
               stored_row = col; stored_col = row
            END IF
            ! Apply cropping
            IF (do_crop) THEN
               IF (stored_row .LT. block_row_bounds(1)) CYCLE
               IF (stored_row .GT. block_row_bounds(2)) CYCLE
               IF (stored_col .LT. block_col_bounds(1)) CYCLE
               IF (stored_col .GT. block_col_bounds(2)) CYCLE
            END IF
            row_img = row_img_dist(stored_row)
            col_img = col_img_dist(stored_col)
            CALL image_calculator(imgdist, &
                                  prow=prow, pcol=pcol, &
                                  rowi=rowi, coli=coli, &
                                  myprow=row_dist(stored_row), myrowi=row_img, &
                                  mypcol=col_dist(stored_col), mycoli=col_img, &
                                  shifting='0')
            dst_proc = blacs2mpi(prow, pcol)
!$          IF (is_left .AND. do_symmetry) THEN
!$             myt = threads_dist(stored_row)
!$          END IF
!$OMP ATOMIC
            send_sizes(imeta, myt, ui, dst_proc) = &
               send_sizes(imeta, myt, ui, dst_proc) + 3
!$OMP ATOMIC
            send_sizes(idata, myt, ui, dst_proc) = &
               send_sizes(idata, myt, ui, dst_proc) + nze
         END DO ! symmetry_i
      END DO
      CALL dbcsr_iterator_stop(iter)
!$OMP BARRIER
!$OMP MASTER
      ! Exchange refs
      ALLOCATE (recv_sizes(idata:imeta, 0:nthreads - 1, &
                           nimages, 0:nprocs_total - 1))
      CALL timeset(routineN//"_sizes", handle2)
      CALL mp_alltoall(send_sizes(:, :, :, :), &
                       recv_sizes(:, :, :, :), &
                       2*nimages*nthreads, grp)
      CALL timestop(handle2)
      !
      ! Evaluate the local size for each image, accumulating over threads and procs.
      ! Take the local displacement for each image.
      ! Note that displacement starts at zero.
      my_size_recv(:) = 0
      local_images_size(:, :) = 0
      ALLOCATE (local_images_displ(idata:imeta, nimages))
      DO ui = 1, nimages
         local_images_displ(:, ui) = my_size_recv(:)
         DO dst_proc = 0, nprocs_total - 1
            DO it = 0, nthreads - 1
               local_images_size(:, ui) = local_images_size(:, ui) + &
                                          recv_sizes(:, it, ui, dst_proc)
            END DO
         END DO
         IF (local_images_size(imeta, ui) .EQ. 0) CYCLE
         ! Include stats slots for threads indices
         local_images_size(imeta, ui) = local_images_size(imeta, ui) + size_index
         my_size_recv(:) = my_size_recv(:) + local_images_size(:, ui)
      END DO
      !
      ! Exchange sizes
      IF (is_left) THEN
         CALL mp_iallgather(local_images_size, left_images_size, buffer%subgrp, requests(irequests))
      ELSE
         CALL mp_iallgather(local_images_size, right_images_size, buffer%subgrp, requests(irequests))
      END IF
      !
      ! Allocate data and meta buffers
      do_win_create(:) = .NOT. buffer%has_rma_win
      IF (buffer%has_rma_win) THEN
         IF (buffer%grp .NE. grp .OR. dbcsr_data_get_type(buffer%data) .NE. data_type) THEN
            do_win_create(:) = .TRUE.
         END IF
      END IF
      CALL buffer_init(buffer, data_type, &
                       my_size_recv(idata), my_size_recv(imeta), &
                       data_memory_type=memtype_mpi_buffer)
      buffer%grp = grp
      !
      ! Set send and recv buffers sizes and displacements for each proc.
      ! Accumulate over images and threads.
      ! Here displacement starts at one.
      ALLOCATE (send_displs(idata:imeta, 0:nthreads - 1, &
                            nimages, 0:nprocs_total - 1))
      ! Displs for local data arrangement, starting at one.
      ALLOCATE (recv_displs(idata:imeta, 0:nthreads - 1, &
                            nimages, 0:nprocs_total - 1))
      ! Here displacement starts at zero.
      ALLOCATE (send_size_proc(idata:imeta, 0:nprocs_total - 1))
      ALLOCATE (recv_size_proc(idata:imeta, 0:nprocs_total - 1))
      ALLOCATE (send_displ_proc(idata:imeta, 0:nprocs_total - 1))
      ALLOCATE (recv_displ_proc(idata:imeta, 0:nprocs_total - 1))
      my_size_send(:) = 1
      my_size_recv(:) = 1
      DO dst_proc = 0, nprocs_total - 1
         send_displ_proc(:, dst_proc) = my_size_send(:) - 1
         recv_displ_proc(:, dst_proc) = my_size_recv(:) - 1
         ! Avoid communication of local data
         IF (dst_proc .NE. mynode) THEN
            DO ui = 1, nimages
               DO it = 0, nthreads - 1
                  send_displs(:, it, ui, dst_proc) = my_size_send(:)
                  recv_displs(:, it, ui, dst_proc) = my_size_recv(:)
                  my_size_send(:) = my_size_send(:) + send_sizes(:, it, ui, dst_proc)
                  my_size_recv(:) = my_size_recv(:) + recv_sizes(:, it, ui, dst_proc)
               END DO
            END DO
         ELSE
            ! Reset all
            send_displs(:, :, :, dst_proc) = 0
            recv_displs(:, :, :, dst_proc) = 0
         END IF
         send_size_proc(:, dst_proc) = my_size_send(:) - send_displ_proc(:, dst_proc) - 1
         recv_size_proc(:, dst_proc) = my_size_recv(:) - recv_displ_proc(:, dst_proc) - 1
      END DO
      !
      ! Allocate data/meta to send
      IF (dbcsr_data_valid(make_buffers_data_send)) THEN
         IF (dbcsr_data_get_type(make_buffers_data_send) .NE. data_type) THEN
            CALL dbcsr_data_release(make_buffers_data_send)
         END IF
      END IF
      IF (dbcsr_data_valid(make_buffers_data_send)) THEN
         CALL dbcsr_data_ensure_size(make_buffers_data_send, my_size_send(idata) - 1, nocopy=.TRUE.)
      ELSE
         CALL dbcsr_data_init(make_buffers_data_send)
         CALL dbcsr_data_new(make_buffers_data_send, data_type, my_size_send(idata) - 1, &
                             memory_type=memtype_mpi_buffer)
      END IF
      CALL ensure_array_size(make_buffers_meta_send, ub=my_size_send(imeta) - 1, &
                             nocopy=.TRUE., memory_type=memtype_mpi_buffer)
      ! Displs for data offset
      ALLOCATE (offset_threads(idata:imeta, 0:nthreads - 1, nimages))
      offset_threads(:, :, :) = 0
      ! Set offset for local data
      ALLOCATE (offset_data(0:nthreads - 1, nimages, 0:nprocs_total - 1))
      offset_data(:, :, :) = 1
      ! Evaluate local displs
      DO ui = 1, nimages
         IF (local_images_size(imeta, ui) .EQ. 0) CYCLE
         offset_threads(:, 0, ui) = 0
         DO it = 1, nthreads - 1
            offset_threads(:, it, ui) = offset_threads(:, it - 1, ui)
            DO dst_proc = 0, nprocs_total - 1
               offset_threads(:, it, ui) = offset_threads(:, it, ui) + &
                                           recv_sizes(:, it - 1, ui, dst_proc)
            END DO
         END DO
         ! Fill meta indices for threads
!$       IF (is_left) THEN
!$          buffer%meta(local_images_displ(imeta, ui) + 1:local_images_displ(imeta, ui) + nthreads) = &
!$             offset_threads(imeta, :, ui)/3
!$          buffer%meta(local_images_displ(imeta, ui) + size_index) = &
!$             (local_images_size(imeta, ui) - size_index)/3
!$       END IF
         offset_threads(imeta, :, ui) = offset_threads(imeta, :, ui) + local_images_displ(imeta, ui) + size_index + 1
         send_displs(:, :, ui, mynode) = offset_threads(:, :, ui)
         !
         ! Allow ordering by proc for insertion
         DO dst_proc = 0, mynode - 1
            DO it = 0, nthreads - 1
               send_displs(:, it, ui, mynode) = send_displs(:, it, ui, mynode) + &
                                                recv_sizes(:, it, ui, dst_proc)
            END DO
         END DO
         offset_data(:, ui, mynode) = send_displs(idata, :, ui, mynode) + 1
         send_displs(idata, :, ui, mynode) = send_displs(idata, :, ui, mynode) + local_images_displ(idata, ui) + 1
      END DO
!$OMP END MASTER
!$OMP BARRIER
      !
      IF (do_part_crop_row .OR. do_part_crop_col) THEN
         CALL dbcsr_data_init(data_block)
         CALL dbcsr_data_new(data_block, dbcsr_type_1d_to_2d(data_type))
      END IF
      !
      ! Copy data and meta in the buffers
      CALL timeset(routineN//"_pack", handle2)
      CALL dbcsr_iterator_start(iter, matrix, shared=.TRUE.)
      DO WHILE (dbcsr_iterator_blocks_left(iter))
         CALL dbcsr_iterator_next_block(iter, row, col, blk, blk_p=blk_p, &
                                        row_size=row_size, col_size=col_size)
         nze = row_size*col_size
         IF (nze .EQ. 0) CYCLE
         bp = ABS(blk_p)
         DO symmetry_i = 1, nsymmetries
            IF (symmetry_i .EQ. 1) THEN
               stored_row = row; stored_col = col; tr = blk_p .LT. 0
               tr_row_size = col_size; tr_col_size = row_size
            ELSE
               IF (row .EQ. col) CYCLE
               stored_row = col; stored_col = row; tr = blk_p .GT. 0
               tr_row_size = row_size; tr_col_size = col_size
            END IF
            ! Apply cropping
            IF (do_crop) THEN
               IF (stored_row .LT. block_row_bounds(1)) CYCLE
               IF (stored_row .GT. block_row_bounds(2)) CYCLE
               IF (stored_col .LT. block_col_bounds(1)) CYCLE
               IF (stored_col .GT. block_col_bounds(2)) CYCLE
            END IF
            row_img = row_img_dist(stored_row)
            col_img = col_img_dist(stored_col)
            CALL image_calculator(imgdist, &
                                  prow=prow, pcol=pcol, &
                                  rowi=rowi, coli=coli, &
                                  myprow=row_dist(stored_row), myrowi=row_img, &
                                  mypcol=col_dist(stored_col), mycoli=col_img, &
                                  shifting='0')
            dst_proc = blacs2mpi(prow, pcol)
            IF (dst_proc .EQ. mynode) THEN
               data_buffer_p => buffer%data
               meta_buffer_p => buffer%meta
            ELSE
               data_buffer_p => make_buffers_data_send
               meta_buffer_p => make_buffers_meta_send
            END IF
!$          IF (is_left .AND. do_symmetry) THEN
!$             myt = threads_dist(stored_row)
!$             call omp_set_lock(locks(myt))
!$          END IF
            IF (tr) THEN
               CALL dbcsr_block_transpose_aa(data_buffer_p, sm%data_area, tr_row_size, tr_col_size, &
                                             send_displs(idata, myt, ui, dst_proc), bp, &
                                             scale_value)
               IF (sm%negate_real .AND. sm%negate_imaginary) THEN
                  CALL dbcsr_block_scale(data_buffer_p, scale=scale_neg_one, &
                                         row_size=nze, col_size=1, &
                                         lb=send_displs(idata, myt, ui, dst_proc))
               ELSEIF (sm%negate_real) THEN
                  CALL dbcsr_block_real_neg(data_buffer_p, row_size=nze, col_size=1, &
                                            lb=send_displs(idata, myt, ui, dst_proc))
               ELSEIF (sm%negate_imaginary) THEN
                  CALL dbcsr_block_conjg(data_buffer_p, row_size=nze, col_size=1, &
                                         lb=send_displs(idata, myt, ui, dst_proc))
               END IF
            ELSE
               CALL dbcsr_block_copy_aa(data_buffer_p, sm%data_area, row_size, col_size, &
                                        send_displs(idata, myt, ui, dst_proc), bp, &
                                        scale_value)
            END IF
            !
            ! Apply cropping for partial blocks
            IF (do_part_crop_row .OR. do_part_crop_col) THEN
               CALL dbcsr_data_set_pointer( &
                  area=data_block, &
                  rsize=row_size, &
                  csize=col_size, &
                  pointee=data_buffer_p, &
                  source_lb=send_displs(idata, myt, ui, dst_proc))
               IF (do_part_crop_row) THEN
                  IF (do_part_crop_f_row .AND. stored_row .EQ. block_row_bounds(1)) THEN
                     CALL dbcsr_data_clear(data_block, ub=f_row_f)
                  END IF
                  IF (do_part_crop_l_row .AND. stored_row .EQ. block_row_bounds(2)) THEN
                     CALL dbcsr_data_clear(data_block, lb=l_row_l)
                  END IF
               END IF
               IF (do_part_crop_col) THEN
                  IF (do_part_crop_f_col .AND. stored_col .EQ. block_col_bounds(1)) THEN
                     CALL dbcsr_data_clear(data_block, ub2=f_col_f)
                  END IF
                  IF (do_part_crop_l_col .AND. stored_col .EQ. block_col_bounds(2)) THEN
                     CALL dbcsr_data_clear(data_block, lb2=l_col_l)
                  END IF
               END IF
            END IF
            !
            ! Set meta data (global or local indexing)
            IF (dst_proc .EQ. mynode) THEN
               stored_row = local_g2l_map_rows(stored_row)
               stored_col = local_g2l_map_cols(stored_col)
               ! Count the maximum possible multiplies per row for on-the-fly filtering
               IF (is_left .AND. otf_filtering) THEN
                  left_total_row_counts(stored_row) = &
                     left_total_row_counts(stored_row) + 1
               END IF
            END IF
            meta_buffer_p(send_displs(imeta, myt, ui, dst_proc)) = stored_row
            meta_buffer_p(send_displs(imeta, myt, ui, dst_proc) + 1) = stored_col
            meta_buffer_p(send_displs(imeta, myt, ui, dst_proc) + 2) = offset_data(myt, ui, dst_proc)
            !
            send_displs(imeta, myt, ui, dst_proc) = send_displs(imeta, myt, ui, dst_proc) + 3
            send_displs(idata, myt, ui, dst_proc) = send_displs(idata, myt, ui, dst_proc) + nze
            offset_data(myt, ui, dst_proc) = offset_data(myt, ui, dst_proc) + nze
!$          IF (is_left .AND. do_symmetry) THEN
!$             call omp_unset_lock(locks(myt))
!$          END IF
         END DO
      END DO
      CALL dbcsr_iterator_stop(iter)
      CALL timestop(handle2)
      !
      IF (do_part_crop_row .OR. do_part_crop_col) THEN
         CALL dbcsr_data_clear_pointer(data_block)
         CALL dbcsr_data_release(data_block)
      END IF
      !
!$OMP BARRIER
!$OMP MASTER
      !
      ! Allocate data/meta to recv
      IF (dbcsr_data_valid(make_buffers_data_recv)) THEN
         IF (dbcsr_data_get_type(make_buffers_data_recv) .NE. data_type) THEN
            CALL dbcsr_data_release(make_buffers_data_recv)
         END IF
      END IF
      IF (dbcsr_data_valid(make_buffers_data_recv)) THEN
         CALL dbcsr_data_ensure_size(make_buffers_data_recv, my_size_recv(idata) - 1, nocopy=.TRUE.)
      ELSE
         CALL dbcsr_data_init(make_buffers_data_recv)
         CALL dbcsr_data_new(make_buffers_data_recv, data_type, my_size_recv(idata) - 1, &
                             memory_type=memtype_mpi_buffer)
      END IF
      CALL ensure_array_size(make_buffers_meta_recv, ub=my_size_recv(imeta) - 1, &
                             nocopy=.TRUE., memory_type=memtype_mpi_buffer)
      ! Exchange data
      CALL timeset(routineN//"_data", handle2)
      CALL hybrid_alltoall_any(make_buffers_data_send, send_size_proc(idata, :), send_displ_proc(idata, :), &
                               make_buffers_data_recv, recv_size_proc(idata, :), recv_displ_proc(idata, :), &
                               mp_obj, &
                               most_ptp=.TRUE., remainder_ptp=.TRUE., no_hybrid=.FALSE.)
      CALL hybrid_alltoall_i1(make_buffers_meta_send, send_size_proc(imeta, :), send_displ_proc(imeta, :), &
                              make_buffers_meta_recv, recv_size_proc(imeta, :), recv_displ_proc(imeta, :), &
                              mp_obj, &
                              most_ptp=.TRUE., remainder_ptp=.TRUE., no_hybrid=.FALSE.)
      CALL timestop(handle2)
!$OMP END MASTER
!$OMP BARRIER
!$    IF (is_left .AND. do_symmetry) THEN
!$       call omp_destroy_lock(locks(ithread))
!$    END IF
      !
      ! Arrange data in the local buffers in images
      data_buffer_p => buffer%data
      meta_buffer_p => buffer%meta
      DO ui = 1, nimages
         ! Check for empty images
         IF (local_images_size(imeta, ui) .EQ. 0) CYCLE
         DO dst_proc = 0, nprocs_total - 1
            IF (recv_sizes(imeta, ithread, ui, dst_proc) .EQ. 0) CYCLE
            ! Skip local data
            IF (dst_proc .EQ. mynode) THEN
               offset_threads(:, ithread, ui) = offset_threads(:, ithread, ui) + &
                                                recv_sizes(:, ithread, ui, dst_proc)
            ELSE
               ! Copy meta, block by block
               DO blk = recv_displs(imeta, ithread, ui, dst_proc), &
                  recv_displs(imeta, ithread, ui, dst_proc) + recv_sizes(imeta, ithread, ui, dst_proc) - 1, 3
                  stored_row = local_g2l_map_rows(make_buffers_meta_recv(blk))
                  stored_col = local_g2l_map_cols(make_buffers_meta_recv(blk + 1))
                  meta_buffer_p(offset_threads(imeta, ithread, ui)) = stored_row
                  meta_buffer_p(offset_threads(imeta, ithread, ui) + 1) = stored_col
                  meta_buffer_p(offset_threads(imeta, ithread, ui) + 2) = make_buffers_meta_recv(blk + 2) + &
                                                                          offset_threads(idata, ithread, ui)
                  offset_threads(imeta, ithread, ui) = offset_threads(imeta, ithread, ui) + 3
                  ! Count the maximum possible multiplies per row for on-the-fly filtering
                  IF (is_left .AND. otf_filtering) THEN
!$OMP ATOMIC
                     left_total_row_counts(stored_row) = &
                        left_total_row_counts(stored_row) + 1
                  END IF
               END DO
               ! Copy data
               CALL dbcsr_data_set(data_buffer_p, &
                                   offset_threads(idata, ithread, ui) + local_images_displ(idata, ui) + 1, &
                                   recv_sizes(idata, ithread, ui, dst_proc), &
                                   make_buffers_data_recv, recv_displs(idata, ithread, ui, dst_proc))
               offset_threads(idata, ithread, ui) = offset_threads(idata, ithread, ui) + &
                                                    recv_sizes(idata, ithread, ui, dst_proc)
            END IF
         END DO
      END DO
!$OMP END PARALLEL
      DEALLOCATE (send_sizes, recv_sizes)
      DEALLOCATE (send_displs, recv_displs, offset_data, offset_threads)
      DEALLOCATE (send_size_proc, send_displ_proc, recv_size_proc, recv_displ_proc)
      !
      IF (is_left .AND. otf_filtering) THEN
         CALL mp_isum(left_total_row_counts, dbcsr_mp_my_row_group(mp_obj), request_count_rows)
      END IF
      !
      CALL setup_rec_index_images(buffer%meta, img_nblks_rows, img_nblks_cols, &
                                  local_images_size(imeta, :), local_images_displ(imeta, :), &
                                  size_index, is_left)
      IF (buffer%has_rma_win) THEN
         do_win_create(1) = do_win_create(1) .OR. dbcsr_data_exists(buffer%data_before_resize)
         do_win_create(2) = do_win_create(2) .OR. ASSOCIATED(buffer%meta_before_resize)
         CALL mp_isum(do_win_create, buffer%subgrp, requests_win_create(irequests))
      END IF
      !
      IF (is_left) THEN
         NULLIFY (local_g2l_map_rows)
         DEALLOCATE (local_g2l_map_cols)
      ELSE
         DEALLOCATE (local_g2l_map_rows)
         NULLIFY (local_g2l_map_cols)
      END IF
!$    IF (is_left .AND. do_symmetry) THEN
!$       DEALLOCATE (locks)
!$    END IF
      !
      DEALLOCATE (img_nblks_rows, img_nblks_cols)
      DEALLOCATE (local_images_displ)
      !
      CALL timestop(handle)
   END SUBROUTINE make_buffers