Prepare orig images for MPI windows
Type | Intent | Optional | Attributes | Name | ||
---|---|---|---|---|---|---|
type(dbcsr_type), | intent(in) | :: | matrix | |||
type(dbcsr_imagedistribution_obj), | intent(in) | :: | imgdist | |||
logical, | intent(in) | :: | is_left | |||
integer, | intent(in) | :: | f_row | |||
integer, | intent(in) | :: | l_row | |||
integer, | intent(in) | :: | f_col | |||
integer, | intent(in) | :: | l_col | |||
logical, | intent(in) | :: | otf_filtering | |||
type(dbcsr_scalar_type), | intent(in), | optional | :: | scale_value |
SUBROUTINE make_buffers(matrix, imgdist, is_left, & !! Prepare orig images for MPI windows f_row, l_row, f_col, l_col, & otf_filtering, scale_value) TYPE(dbcsr_type), INTENT(IN) :: matrix TYPE(dbcsr_imagedistribution_obj), INTENT(IN) :: imgdist LOGICAL, INTENT(IN) :: is_left INTEGER, INTENT(IN) :: f_row, l_row, f_col, l_col LOGICAL, INTENT(IN) :: otf_filtering TYPE(dbcsr_scalar_type), INTENT(IN), OPTIONAL :: scale_value CHARACTER(len=*), PARAMETER :: routineN = 'make_buffers' INTEGER :: blk, blk_p, bp, col, col_img, col_size, data_type, dst_proc, f_col_f, f_row_f, & handle, handle2, irequests, it, ithread, l_col_l, l_row_l, mynode, myt, & nblkcols_local, nblkrows_local, ncols_images, nimages, nprocs, nprocs_total, & nrows_images, nsymmetries, nthreads, nze, pcol, prow, row, row_img, row_size, size_index, & stored_col, stored_row, symmetry_i, tr_col_size, tr_row_size INTEGER, ALLOCATABLE, DIMENSION(:) :: img_nblks_cols, img_nblks_rows INTEGER, ALLOCATABLE, DIMENSION(:, :) :: local_images_displ, recv_displ_proc, & recv_size_proc, send_displ_proc, & send_size_proc INTEGER, ALLOCATABLE, DIMENSION(:, :, :) :: offset_data, offset_threads INTEGER, ALLOCATABLE, DIMENSION(:, :, :, :) :: recv_displs, recv_sizes, send_displs, & send_sizes INTEGER, DIMENSION(2) :: block_col_bounds, block_row_bounds INTEGER, DIMENSION(:), POINTER, CONTIGUOUS :: col_dist, col_img_dist, local_cols, local_g2l_map_cols, & local_g2l_map_rows, local_rows, meta_buffer_p, & row_dist, row_img_dist, threads_dist INTEGER, DIMENSION(:, :), POINTER, CONTIGUOUS :: blacs2mpi, local_images_size INTEGER, DIMENSION(idata:imeta) :: my_size_recv, my_size_send INTEGER, POINTER :: coli, rowi INTEGER, TARGET :: mi, ui LOGICAL :: do_crop, do_part_crop_col, do_part_crop_f_col, do_part_crop_f_row, & do_part_crop_l_col, do_part_crop_l_row, do_part_crop_row, do_symmetry, tr LOGICAL, DIMENSION(:), POINTER, CONTIGUOUS :: do_win_create TYPE(dbcsr_buffer), POINTER :: buffer TYPE(dbcsr_data_obj) :: data_block TYPE(dbcsr_data_obj), POINTER :: data_buffer_p TYPE(dbcsr_distribution_obj) :: set_dist TYPE(dbcsr_iterator) :: iter TYPE(dbcsr_mp_obj) :: mp_obj TYPE(dbcsr_scalar_type) :: scale_neg_one TYPE(dbcsr_type) :: sm TYPE(mp_comm_type) :: grp !$ INTEGER(kind=omp_lock_kind), ALLOCATABLE, DIMENSION(:) :: locks CALL timeset(routineN, handle) ! Sync with previous multiplication IF (request_sync_mult .NE. mp_request_null) & DBCSR_ABORT("Multiplications are not in sync!") ! ! Take input values and check validity IF (.NOT. dbcsr_valid_index(matrix)) & DBCSR_ABORT("Matrix not initialized.") sm = matrix data_type = sm%data_type IF (data_type .NE. dbcsr_type_real_8 .AND. & data_type .NE. dbcsr_type_real_4 .AND. & data_type .NE. dbcsr_type_complex_8 .AND. & data_type .NE. dbcsr_type_complex_4) & DBCSR_ABORT("Invalid data type.") scale_neg_one = dbcsr_scalar_negative(dbcsr_scalar_one(data_type)) set_dist = imgdist%i%main row_dist => dbcsr_distribution_row_dist(set_dist) col_dist => dbcsr_distribution_col_dist(set_dist) local_rows => dbcsr_distribution_local_rows(set_dist) local_cols => dbcsr_distribution_local_cols(set_dist) nblkrows_local = SIZE(local_rows) nblkcols_local = SIZE(local_cols) IF (sm%symmetry) THEN IF (SIZE(row_dist) .NE. SIZE(col_dist)) & DBCSR_WARN('Unequal row and column distributions for symmetric matrix.') END IF nrows_images = imgdist%i%row_decimation row_img_dist => array_data(imgdist%i%row_image) ncols_images = imgdist%i%col_decimation col_img_dist => array_data(imgdist%i%col_image) mp_obj = dbcsr_distribution_mp(imgdist%i%main) CALL dbcsr_mp_grid_setup(mp_obj) nprocs_total = dbcsr_mp_numnodes(mp_obj) mynode = dbcsr_mp_mynode(mp_obj) grp = dbcsr_mp_group(mp_obj) blacs2mpi => dbcsr_mp_pgrid(mp_obj) IF (dbcsr_distribution_max_row_dist(set_dist) .GT. UBOUND(blacs2mpi, 1)) & DBCSR_ABORT("Row distribution references unexistent processor rows") IF (dbcsr_distribution_max_col_dist(set_dist) .GT. UBOUND(blacs2mpi, 2)) & DBCSR_ABORT("Col distribution references unexistent processor cols") ! Check threads configuration NULLIFY (threads_dist) !$ IF (.NOT. dbcsr_distribution_has_threads(dbcsr_distribution(matrix))) & !$ DBCSR_ABORT("Thread distribution not defined") !$ threads_dist => array_data(dbcsr_distribution_thread_dist(dbcsr_distribution(matrix))) IF (is_left) THEN IF (nrows_images .GT. 1) & DBCSR_ABORT("Row nimages for left matrix is not 1!") ELSE IF (ncols_images .GT. 1) & DBCSR_ABORT("Col nimages for right matrix is not 1!") END IF ! ! Crop matrix do_crop = .FALSE. do_part_crop_row = .FALSE. do_part_crop_col = .FALSE. do_part_crop_f_row = .FALSE. do_part_crop_l_row = .FALSE. do_part_crop_f_col = .FALSE. do_part_crop_l_col = .FALSE. ! Set no limits IF (ANY((/f_row, l_row, f_col, l_col/) .NE. 0)) THEN IF (f_row .LT. 0) & DBCSR_ABORT("Invalid first row bound.") IF (l_row .GT. dbcsr_nfullrows_total(matrix)) & DBCSR_ABORT("Invalid last row bound.") IF (f_col .LT. 0) & DBCSR_ABORT("Invalid first column bound.") IF (l_col .GT. dbcsr_nfullcols_total(matrix)) & DBCSR_ABORT("Invalid last column bound.") ! do_crop = .TRUE. ! ! Convert bounds to block addressing IF (f_row .EQ. 0) THEN block_row_bounds(1) = 1 ELSE CALL find_block_of_element(f_row, block_row_bounds(1), & dbcsr_nblkrows_total(matrix), & dbcsr_row_block_offsets(matrix), & hint=0) do_part_crop_f_row = array_get(dbcsr_row_block_offsets(matrix), block_row_bounds(1)) .NE. f_row IF (do_part_crop_f_row) THEN ! Block offset of last cleared row f_row_f = f_row - array_get(dbcsr_row_block_offsets(matrix), block_row_bounds(1)) END IF END IF ! IF (l_row .EQ. 0) THEN block_row_bounds(2) = dbcsr_nblkrows_total(matrix) ELSE CALL find_block_of_element(l_row, block_row_bounds(2), & dbcsr_nblkrows_total(matrix), & dbcsr_row_block_offsets(matrix), & hint=0) do_part_crop_l_row = (array_get(dbcsr_row_block_offsets(matrix), block_row_bounds(2) + 1) - 1) .NE. l_row IF (do_part_crop_l_row) THEN ! Block offset of first cleared row l_row_l = 2 + l_row - array_get(dbcsr_row_block_offsets(matrix), block_row_bounds(2)) END IF END IF do_part_crop_row = do_part_crop_f_row .OR. do_part_crop_l_row ! IF (f_col .EQ. 0) THEN block_col_bounds(1) = 1 ELSE CALL find_block_of_element(f_col, block_col_bounds(1), & dbcsr_nblkcols_total(matrix), & dbcsr_col_block_offsets(matrix), & hint=0) do_part_crop_f_col = array_get(dbcsr_col_block_offsets(matrix), block_col_bounds(1)) .NE. f_col IF (do_part_crop_f_col) THEN ! Block offset of last cleared col f_col_f = f_col - array_get(dbcsr_col_block_offsets(matrix), block_col_bounds(1)) END IF END IF ! IF (l_col .EQ. 0) THEN block_col_bounds(2) = dbcsr_nblkcols_total(matrix) ELSE CALL find_block_of_element(l_col, block_col_bounds(2), & dbcsr_nblkcols_total(matrix), & dbcsr_col_block_offsets(matrix), & hint=0) do_part_crop_l_col = (array_get(dbcsr_col_block_offsets(matrix), block_col_bounds(2) + 1) - 1) .NE. l_col IF (do_part_crop_l_col) THEN ! Block offset of first cleared col l_col_l = 2 + l_col - array_get(dbcsr_col_block_offsets(matrix), block_col_bounds(2)) END IF END IF do_part_crop_col = do_part_crop_f_col .OR. do_part_crop_l_col END IF ! IF (dbcsr_has_symmetry(matrix)) THEN nsymmetries = 2 do_symmetry = .TRUE. ELSE nsymmetries = 1 do_symmetry = .FALSE. END IF ! IF (is_left) THEN nimages = ncols_images buffer => buffers_win%left nprocs = dbcsr_mp_npcols(mp_obj) ALLOCATE (left_images_size(idata:imeta, & nimages, & MAX(1, dbcsr_mp_nprows(mp_obj)/layers_3D_C_reduction%side3D), & 0:nprocs - 1)) ALLOCATE (left_local_images_size(idata:imeta, nimages)) local_images_size => left_local_images_size irequests = 1 ! ! Count the maximum possible multiplies per row for on-the-fly filtering IF (otf_filtering) THEN ALLOCATE (left_total_row_counts(nblkrows_local)) left_total_row_counts = 0 END IF do_win_create => do_win_create_left ELSE nimages = nrows_images buffer => buffers_win%right nprocs = dbcsr_mp_nprows(mp_obj) ALLOCATE (right_images_size(idata:imeta, & nimages, & MAX(1, dbcsr_mp_npcols(mp_obj)/layers_3D_C_reduction%side3D), & 0:nprocs - 1)) ALLOCATE (right_local_images_size(idata:imeta, nimages)) local_images_size => right_local_images_size irequests = 2 do_win_create => do_win_create_right END IF ! ! 3D communicator CALL make_layers_3D_AB(layers_3D_C_reduction%num_layers_3D, & layers_3D_C_reduction%side3D, & mp_obj, is_left, buffer) ! ! Evaluate maps for global -> local indexing (g2l_map_rows, g2l_map_cols) ! Count the number of blocks per row/column (img_nblks_rows, img_nblks_cols) IF (is_left) THEN ALLOCATE (g2l_map_rows(sm%nblkrows_total)) local_g2l_map_rows => g2l_map_rows ALLOCATE (local_g2l_map_cols(sm%nblkcols_total)) ALLOCATE (img_nblks_rows(1), img_nblks_cols(nimages)) ELSE ALLOCATE (g2l_map_cols(sm%nblkcols_total)) local_g2l_map_cols => g2l_map_cols ALLOCATE (local_g2l_map_rows(sm%nblkrows_total)) ALLOCATE (img_nblks_rows(nimages), img_nblks_cols(1)) END IF ! local_g2l_map_rows(:) = 0 IF (nrows_images .EQ. 1) THEN img_nblks_rows(1) = nblkrows_local DO row = 1, nblkrows_local local_g2l_map_rows(local_rows(row)) = row END DO ELSE img_nblks_rows(:) = 0 DO row = 1, nblkrows_local row_img = row_img_dist(local_rows(row)) ui = MOD(row_img - 1, nrows_images) + 1 img_nblks_rows(ui) = img_nblks_rows(ui) + 1 local_g2l_map_rows(local_rows(row)) = img_nblks_rows(ui) END DO END IF ! local_g2l_map_cols(:) = 0 IF (ncols_images .EQ. 1) THEN img_nblks_cols(1) = nblkcols_local DO col = 1, nblkcols_local local_g2l_map_cols(local_cols(col)) = col END DO ELSE img_nblks_cols(:) = 0 DO col = 1, nblkcols_local col_img = col_img_dist(local_cols(col)) ui = MOD(col_img - 1, ncols_images) + 1 img_nblks_cols(ui) = img_nblks_cols(ui) + 1 local_g2l_map_cols(local_cols(col)) = img_nblks_cols(ui) END DO END IF ! !$OMP PARALLEL DEFAULT (NONE) & !$OMP PRIVATE (ithread,myt,iter,row,col,blk,row_size,col_size,& !$OMP stored_row,stored_col,blk_p,bp,tr,& !$OMP nze,symmetry_i,row_img,col_img,rowi,coli,& !$OMP tr_row_size,tr_col_size,prow,pcol,dst_proc,& !$OMP data_buffer_p,meta_buffer_p,& !$OMP mi,ui,it,data_block) & !$OMP SHARED (nthreads,send_sizes,offset_data,matrix,nsymmetries,do_symmetry,& !$OMP row_img_dist,col_img_dist,imgdist,row_dist,col_dist,& !$OMP is_left,my_size_send,my_size_recv,nimages,& !$OMP local_images_size,data_type,memtype_mpi_buffer,sm,& !$OMP img_nblks_cols,img_nblks_rows,mynode,offset_threads,& !$OMP local_g2l_map_cols,local_g2l_map_rows,recv_sizes,grp,make_buffers_meta_send,& !$OMP scale_value,scale_neg_one,make_buffers_data_send,make_buffers_data_recv,& !$OMP size_index,recv_displ_proc,recv_size_proc,send_size_proc,& !$OMP mp_obj,threads_dist,make_buffers_meta_recv,nrows_images,ncols_images,& !$OMP locks,blacs2mpi,send_displ_proc,recv_displs,send_displs,& !$OMP left_images_size,right_images_size,local_images_displ,requests,& !$OMP buffer,left_total_row_counts,otf_filtering,& !$OMP irequests,do_win_create,handle2,nprocs_total,& !$OMP do_crop,do_part_crop_row,do_part_crop_col,block_row_bounds,block_col_bounds,& !$OMP do_part_crop_f_row,do_part_crop_l_row,do_part_crop_f_col,do_part_crop_l_col,& !$OMP f_row_f,l_row_l,f_col_f,l_col_l,requests_win_create) ithread = 0 !$ ithread = omp_get_thread_num() myt = ithread IF (is_left) THEN rowi => mi coli => ui ELSE rowi => ui coli => mi END IF !$OMP MASTER nthreads = 1 !$ nthreads = omp_get_num_threads() ALLOCATE (send_sizes(idata:imeta, 0:nthreads - 1, & nimages, 0:nprocs_total - 1)) send_sizes(:, :, :, :) = 0 ! size_index = 0 !$ IF (is_left) THEN !$ size_index = nthreads + 1 !$ END IF !$ IF (is_left .AND. do_symmetry) THEN !$ ALLOCATE (locks(0:nthreads - 1)) !$ END IF !$OMP END MASTER !$OMP BARRIER !$ IF (is_left .AND. do_symmetry) THEN !$ call omp_init_lock(locks(ithread)) !$ END IF ! ! Take data and meta dimensions per each thread, image, proc CALL dbcsr_iterator_start(iter, matrix, shared=.TRUE.) DO WHILE (dbcsr_iterator_blocks_left(iter)) CALL dbcsr_iterator_next_block(iter, row, col, blk, & row_size=row_size, col_size=col_size) nze = row_size*col_size IF (nze .EQ. 0) CYCLE DO symmetry_i = 1, nsymmetries IF (symmetry_i .EQ. 1) THEN stored_row = row; stored_col = col ELSE IF (row .EQ. col) CYCLE stored_row = col; stored_col = row END IF ! Apply cropping IF (do_crop) THEN IF (stored_row .LT. block_row_bounds(1)) CYCLE IF (stored_row .GT. block_row_bounds(2)) CYCLE IF (stored_col .LT. block_col_bounds(1)) CYCLE IF (stored_col .GT. block_col_bounds(2)) CYCLE END IF row_img = row_img_dist(stored_row) col_img = col_img_dist(stored_col) CALL image_calculator(imgdist, & prow=prow, pcol=pcol, & rowi=rowi, coli=coli, & myprow=row_dist(stored_row), myrowi=row_img, & mypcol=col_dist(stored_col), mycoli=col_img, & shifting='0') dst_proc = blacs2mpi(prow, pcol) !$ IF (is_left .AND. do_symmetry) THEN !$ myt = threads_dist(stored_row) !$ END IF !$OMP ATOMIC send_sizes(imeta, myt, ui, dst_proc) = & send_sizes(imeta, myt, ui, dst_proc) + 3 !$OMP ATOMIC send_sizes(idata, myt, ui, dst_proc) = & send_sizes(idata, myt, ui, dst_proc) + nze END DO ! symmetry_i END DO CALL dbcsr_iterator_stop(iter) !$OMP BARRIER !$OMP MASTER ! Exchange refs ALLOCATE (recv_sizes(idata:imeta, 0:nthreads - 1, & nimages, 0:nprocs_total - 1)) CALL timeset(routineN//"_sizes", handle2) CALL mp_alltoall(send_sizes(:, :, :, :), & recv_sizes(:, :, :, :), & 2*nimages*nthreads, grp) CALL timestop(handle2) ! ! Evaluate the local size for each image, accumulating over threads and procs. ! Take the local displacement for each image. ! Note that displacement starts at zero. my_size_recv(:) = 0 local_images_size(:, :) = 0 ALLOCATE (local_images_displ(idata:imeta, nimages)) DO ui = 1, nimages local_images_displ(:, ui) = my_size_recv(:) DO dst_proc = 0, nprocs_total - 1 DO it = 0, nthreads - 1 local_images_size(:, ui) = local_images_size(:, ui) + & recv_sizes(:, it, ui, dst_proc) END DO END DO IF (local_images_size(imeta, ui) .EQ. 0) CYCLE ! Include stats slots for threads indices local_images_size(imeta, ui) = local_images_size(imeta, ui) + size_index my_size_recv(:) = my_size_recv(:) + local_images_size(:, ui) END DO ! ! Exchange sizes IF (is_left) THEN CALL mp_iallgather(local_images_size, left_images_size, buffer%subgrp, requests(irequests)) ELSE CALL mp_iallgather(local_images_size, right_images_size, buffer%subgrp, requests(irequests)) END IF ! ! Allocate data and meta buffers do_win_create(:) = .NOT. buffer%has_rma_win IF (buffer%has_rma_win) THEN IF (buffer%grp .NE. grp .OR. dbcsr_data_get_type(buffer%data) .NE. data_type) THEN do_win_create(:) = .TRUE. END IF END IF CALL buffer_init(buffer, data_type, & my_size_recv(idata), my_size_recv(imeta), & data_memory_type=memtype_mpi_buffer) buffer%grp = grp ! ! Set send and recv buffers sizes and displacements for each proc. ! Accumulate over images and threads. ! Here displacement starts at one. ALLOCATE (send_displs(idata:imeta, 0:nthreads - 1, & nimages, 0:nprocs_total - 1)) ! Displs for local data arrangement, starting at one. ALLOCATE (recv_displs(idata:imeta, 0:nthreads - 1, & nimages, 0:nprocs_total - 1)) ! Here displacement starts at zero. ALLOCATE (send_size_proc(idata:imeta, 0:nprocs_total - 1)) ALLOCATE (recv_size_proc(idata:imeta, 0:nprocs_total - 1)) ALLOCATE (send_displ_proc(idata:imeta, 0:nprocs_total - 1)) ALLOCATE (recv_displ_proc(idata:imeta, 0:nprocs_total - 1)) my_size_send(:) = 1 my_size_recv(:) = 1 DO dst_proc = 0, nprocs_total - 1 send_displ_proc(:, dst_proc) = my_size_send(:) - 1 recv_displ_proc(:, dst_proc) = my_size_recv(:) - 1 ! Avoid communication of local data IF (dst_proc .NE. mynode) THEN DO ui = 1, nimages DO it = 0, nthreads - 1 send_displs(:, it, ui, dst_proc) = my_size_send(:) recv_displs(:, it, ui, dst_proc) = my_size_recv(:) my_size_send(:) = my_size_send(:) + send_sizes(:, it, ui, dst_proc) my_size_recv(:) = my_size_recv(:) + recv_sizes(:, it, ui, dst_proc) END DO END DO ELSE ! Reset all send_displs(:, :, :, dst_proc) = 0 recv_displs(:, :, :, dst_proc) = 0 END IF send_size_proc(:, dst_proc) = my_size_send(:) - send_displ_proc(:, dst_proc) - 1 recv_size_proc(:, dst_proc) = my_size_recv(:) - recv_displ_proc(:, dst_proc) - 1 END DO ! ! Allocate data/meta to send IF (dbcsr_data_valid(make_buffers_data_send)) THEN IF (dbcsr_data_get_type(make_buffers_data_send) .NE. data_type) THEN CALL dbcsr_data_release(make_buffers_data_send) END IF END IF IF (dbcsr_data_valid(make_buffers_data_send)) THEN CALL dbcsr_data_ensure_size(make_buffers_data_send, my_size_send(idata) - 1, nocopy=.TRUE.) ELSE CALL dbcsr_data_init(make_buffers_data_send) CALL dbcsr_data_new(make_buffers_data_send, data_type, my_size_send(idata) - 1, & memory_type=memtype_mpi_buffer) END IF CALL ensure_array_size(make_buffers_meta_send, ub=my_size_send(imeta) - 1, & nocopy=.TRUE., memory_type=memtype_mpi_buffer) ! Displs for data offset ALLOCATE (offset_threads(idata:imeta, 0:nthreads - 1, nimages)) offset_threads(:, :, :) = 0 ! Set offset for local data ALLOCATE (offset_data(0:nthreads - 1, nimages, 0:nprocs_total - 1)) offset_data(:, :, :) = 1 ! Evaluate local displs DO ui = 1, nimages IF (local_images_size(imeta, ui) .EQ. 0) CYCLE offset_threads(:, 0, ui) = 0 DO it = 1, nthreads - 1 offset_threads(:, it, ui) = offset_threads(:, it - 1, ui) DO dst_proc = 0, nprocs_total - 1 offset_threads(:, it, ui) = offset_threads(:, it, ui) + & recv_sizes(:, it - 1, ui, dst_proc) END DO END DO ! Fill meta indices for threads !$ IF (is_left) THEN !$ buffer%meta(local_images_displ(imeta, ui) + 1:local_images_displ(imeta, ui) + nthreads) = & !$ offset_threads(imeta, :, ui)/3 !$ buffer%meta(local_images_displ(imeta, ui) + size_index) = & !$ (local_images_size(imeta, ui) - size_index)/3 !$ END IF offset_threads(imeta, :, ui) = offset_threads(imeta, :, ui) + local_images_displ(imeta, ui) + size_index + 1 send_displs(:, :, ui, mynode) = offset_threads(:, :, ui) ! ! Allow ordering by proc for insertion DO dst_proc = 0, mynode - 1 DO it = 0, nthreads - 1 send_displs(:, it, ui, mynode) = send_displs(:, it, ui, mynode) + & recv_sizes(:, it, ui, dst_proc) END DO END DO offset_data(:, ui, mynode) = send_displs(idata, :, ui, mynode) + 1 send_displs(idata, :, ui, mynode) = send_displs(idata, :, ui, mynode) + local_images_displ(idata, ui) + 1 END DO !$OMP END MASTER !$OMP BARRIER ! IF (do_part_crop_row .OR. do_part_crop_col) THEN CALL dbcsr_data_init(data_block) CALL dbcsr_data_new(data_block, dbcsr_type_1d_to_2d(data_type)) END IF ! ! Copy data and meta in the buffers CALL timeset(routineN//"_pack", handle2) CALL dbcsr_iterator_start(iter, matrix, shared=.TRUE.) DO WHILE (dbcsr_iterator_blocks_left(iter)) CALL dbcsr_iterator_next_block(iter, row, col, blk, blk_p=blk_p, & row_size=row_size, col_size=col_size) nze = row_size*col_size IF (nze .EQ. 0) CYCLE bp = ABS(blk_p) DO symmetry_i = 1, nsymmetries IF (symmetry_i .EQ. 1) THEN stored_row = row; stored_col = col; tr = blk_p .LT. 0 tr_row_size = col_size; tr_col_size = row_size ELSE IF (row .EQ. col) CYCLE stored_row = col; stored_col = row; tr = blk_p .GT. 0 tr_row_size = row_size; tr_col_size = col_size END IF ! Apply cropping IF (do_crop) THEN IF (stored_row .LT. block_row_bounds(1)) CYCLE IF (stored_row .GT. block_row_bounds(2)) CYCLE IF (stored_col .LT. block_col_bounds(1)) CYCLE IF (stored_col .GT. block_col_bounds(2)) CYCLE END IF row_img = row_img_dist(stored_row) col_img = col_img_dist(stored_col) CALL image_calculator(imgdist, & prow=prow, pcol=pcol, & rowi=rowi, coli=coli, & myprow=row_dist(stored_row), myrowi=row_img, & mypcol=col_dist(stored_col), mycoli=col_img, & shifting='0') dst_proc = blacs2mpi(prow, pcol) IF (dst_proc .EQ. mynode) THEN data_buffer_p => buffer%data meta_buffer_p => buffer%meta ELSE data_buffer_p => make_buffers_data_send meta_buffer_p => make_buffers_meta_send END IF !$ IF (is_left .AND. do_symmetry) THEN !$ myt = threads_dist(stored_row) !$ call omp_set_lock(locks(myt)) !$ END IF IF (tr) THEN CALL dbcsr_block_transpose_aa(data_buffer_p, sm%data_area, tr_row_size, tr_col_size, & send_displs(idata, myt, ui, dst_proc), bp, & scale_value) IF (sm%negate_real .AND. sm%negate_imaginary) THEN CALL dbcsr_block_scale(data_buffer_p, scale=scale_neg_one, & row_size=nze, col_size=1, & lb=send_displs(idata, myt, ui, dst_proc)) ELSEIF (sm%negate_real) THEN CALL dbcsr_block_real_neg(data_buffer_p, row_size=nze, col_size=1, & lb=send_displs(idata, myt, ui, dst_proc)) ELSEIF (sm%negate_imaginary) THEN CALL dbcsr_block_conjg(data_buffer_p, row_size=nze, col_size=1, & lb=send_displs(idata, myt, ui, dst_proc)) END IF ELSE CALL dbcsr_block_copy_aa(data_buffer_p, sm%data_area, row_size, col_size, & send_displs(idata, myt, ui, dst_proc), bp, & scale_value) END IF ! ! Apply cropping for partial blocks IF (do_part_crop_row .OR. do_part_crop_col) THEN CALL dbcsr_data_set_pointer( & area=data_block, & rsize=row_size, & csize=col_size, & pointee=data_buffer_p, & source_lb=send_displs(idata, myt, ui, dst_proc)) IF (do_part_crop_row) THEN IF (do_part_crop_f_row .AND. stored_row .EQ. block_row_bounds(1)) THEN CALL dbcsr_data_clear(data_block, ub=f_row_f) END IF IF (do_part_crop_l_row .AND. stored_row .EQ. block_row_bounds(2)) THEN CALL dbcsr_data_clear(data_block, lb=l_row_l) END IF END IF IF (do_part_crop_col) THEN IF (do_part_crop_f_col .AND. stored_col .EQ. block_col_bounds(1)) THEN CALL dbcsr_data_clear(data_block, ub2=f_col_f) END IF IF (do_part_crop_l_col .AND. stored_col .EQ. block_col_bounds(2)) THEN CALL dbcsr_data_clear(data_block, lb2=l_col_l) END IF END IF END IF ! ! Set meta data (global or local indexing) IF (dst_proc .EQ. mynode) THEN stored_row = local_g2l_map_rows(stored_row) stored_col = local_g2l_map_cols(stored_col) ! Count the maximum possible multiplies per row for on-the-fly filtering IF (is_left .AND. otf_filtering) THEN left_total_row_counts(stored_row) = & left_total_row_counts(stored_row) + 1 END IF END IF meta_buffer_p(send_displs(imeta, myt, ui, dst_proc)) = stored_row meta_buffer_p(send_displs(imeta, myt, ui, dst_proc) + 1) = stored_col meta_buffer_p(send_displs(imeta, myt, ui, dst_proc) + 2) = offset_data(myt, ui, dst_proc) ! send_displs(imeta, myt, ui, dst_proc) = send_displs(imeta, myt, ui, dst_proc) + 3 send_displs(idata, myt, ui, dst_proc) = send_displs(idata, myt, ui, dst_proc) + nze offset_data(myt, ui, dst_proc) = offset_data(myt, ui, dst_proc) + nze !$ IF (is_left .AND. do_symmetry) THEN !$ call omp_unset_lock(locks(myt)) !$ END IF END DO END DO CALL dbcsr_iterator_stop(iter) CALL timestop(handle2) ! IF (do_part_crop_row .OR. do_part_crop_col) THEN CALL dbcsr_data_clear_pointer(data_block) CALL dbcsr_data_release(data_block) END IF ! !$OMP BARRIER !$OMP MASTER ! ! Allocate data/meta to recv IF (dbcsr_data_valid(make_buffers_data_recv)) THEN IF (dbcsr_data_get_type(make_buffers_data_recv) .NE. data_type) THEN CALL dbcsr_data_release(make_buffers_data_recv) END IF END IF IF (dbcsr_data_valid(make_buffers_data_recv)) THEN CALL dbcsr_data_ensure_size(make_buffers_data_recv, my_size_recv(idata) - 1, nocopy=.TRUE.) ELSE CALL dbcsr_data_init(make_buffers_data_recv) CALL dbcsr_data_new(make_buffers_data_recv, data_type, my_size_recv(idata) - 1, & memory_type=memtype_mpi_buffer) END IF CALL ensure_array_size(make_buffers_meta_recv, ub=my_size_recv(imeta) - 1, & nocopy=.TRUE., memory_type=memtype_mpi_buffer) ! Exchange data CALL timeset(routineN//"_data", handle2) CALL hybrid_alltoall_any(make_buffers_data_send, send_size_proc(idata, :), send_displ_proc(idata, :), & make_buffers_data_recv, recv_size_proc(idata, :), recv_displ_proc(idata, :), & mp_obj, & most_ptp=.TRUE., remainder_ptp=.TRUE., no_hybrid=.FALSE.) CALL hybrid_alltoall_i1(make_buffers_meta_send, send_size_proc(imeta, :), send_displ_proc(imeta, :), & make_buffers_meta_recv, recv_size_proc(imeta, :), recv_displ_proc(imeta, :), & mp_obj, & most_ptp=.TRUE., remainder_ptp=.TRUE., no_hybrid=.FALSE.) CALL timestop(handle2) !$OMP END MASTER !$OMP BARRIER !$ IF (is_left .AND. do_symmetry) THEN !$ call omp_destroy_lock(locks(ithread)) !$ END IF ! ! Arrange data in the local buffers in images data_buffer_p => buffer%data meta_buffer_p => buffer%meta DO ui = 1, nimages ! Check for empty images IF (local_images_size(imeta, ui) .EQ. 0) CYCLE DO dst_proc = 0, nprocs_total - 1 IF (recv_sizes(imeta, ithread, ui, dst_proc) .EQ. 0) CYCLE ! Skip local data IF (dst_proc .EQ. mynode) THEN offset_threads(:, ithread, ui) = offset_threads(:, ithread, ui) + & recv_sizes(:, ithread, ui, dst_proc) ELSE ! Copy meta, block by block DO blk = recv_displs(imeta, ithread, ui, dst_proc), & recv_displs(imeta, ithread, ui, dst_proc) + recv_sizes(imeta, ithread, ui, dst_proc) - 1, 3 stored_row = local_g2l_map_rows(make_buffers_meta_recv(blk)) stored_col = local_g2l_map_cols(make_buffers_meta_recv(blk + 1)) meta_buffer_p(offset_threads(imeta, ithread, ui)) = stored_row meta_buffer_p(offset_threads(imeta, ithread, ui) + 1) = stored_col meta_buffer_p(offset_threads(imeta, ithread, ui) + 2) = make_buffers_meta_recv(blk + 2) + & offset_threads(idata, ithread, ui) offset_threads(imeta, ithread, ui) = offset_threads(imeta, ithread, ui) + 3 ! Count the maximum possible multiplies per row for on-the-fly filtering IF (is_left .AND. otf_filtering) THEN !$OMP ATOMIC left_total_row_counts(stored_row) = & left_total_row_counts(stored_row) + 1 END IF END DO ! Copy data CALL dbcsr_data_set(data_buffer_p, & offset_threads(idata, ithread, ui) + local_images_displ(idata, ui) + 1, & recv_sizes(idata, ithread, ui, dst_proc), & make_buffers_data_recv, recv_displs(idata, ithread, ui, dst_proc)) offset_threads(idata, ithread, ui) = offset_threads(idata, ithread, ui) + & recv_sizes(idata, ithread, ui, dst_proc) END IF END DO END DO !$OMP END PARALLEL DEALLOCATE (send_sizes, recv_sizes) DEALLOCATE (send_displs, recv_displs, offset_data, offset_threads) DEALLOCATE (send_size_proc, send_displ_proc, recv_size_proc, recv_displ_proc) ! IF (is_left .AND. otf_filtering) THEN CALL mp_isum(left_total_row_counts, dbcsr_mp_my_row_group(mp_obj), request_count_rows) END IF ! CALL setup_rec_index_images(buffer%meta, img_nblks_rows, img_nblks_cols, & local_images_size(imeta, :), local_images_displ(imeta, :), & size_index, is_left) IF (buffer%has_rma_win) THEN do_win_create(1) = do_win_create(1) .OR. dbcsr_data_exists(buffer%data_before_resize) do_win_create(2) = do_win_create(2) .OR. ASSOCIATED(buffer%meta_before_resize) CALL mp_isum(do_win_create, buffer%subgrp, requests_win_create(irequests)) END IF ! IF (is_left) THEN NULLIFY (local_g2l_map_rows) DEALLOCATE (local_g2l_map_cols) ELSE DEALLOCATE (local_g2l_map_rows) NULLIFY (local_g2l_map_cols) END IF !$ IF (is_left .AND. do_symmetry) THEN !$ DEALLOCATE (locks) !$ END IF ! DEALLOCATE (img_nblks_rows, img_nblks_cols) DEALLOCATE (local_images_displ) ! CALL timestop(handle) END SUBROUTINE make_buffers