Multiplies two DBCSR matrices (experimental MPI algorithm). This algorithm is experimental and it should be not used in production runs.
Type | Intent | Optional | Attributes | Name | ||
---|---|---|---|---|---|---|
type(dbcsr_imagedistribution_obj), | intent(inout) | :: | imgdist_left | |||
type(dbcsr_imagedistribution_obj), | intent(inout) | :: | imgdist_right | |||
type(dbcsr_type), | intent(in) | :: | matrix_left | |||
type(dbcsr_type), | intent(in) | :: | matrix_right | |||
type(dbcsr_type), | intent(inout), | TARGET | :: | product_matrix |
DBCSR product matrix |
|
logical, | intent(in), | optional | :: | retain_sparsity |
retain the sparsity of the existing product matrix; default is no |
|
real(kind=real_8), | intent(in), | optional | :: | filter_eps | ||
integer(kind=int_8), | intent(out) | :: | flop |
effective flop |
||
logical, | intent(in) | :: | keep_product_data |
SUBROUTINE multiply_3D(imgdist_left, imgdist_right, &
matrix_left, matrix_right, &
product_matrix, &
retain_sparsity, &
filter_eps, flop, keep_product_data)
!! Multiplies two DBCSR matrices (experimental MPI algorithm).
!! This algorithm is experimental and it should be not used in
!! production runs.
TYPE(dbcsr_imagedistribution_obj), INTENT(INOUT) :: imgdist_left, imgdist_right
TYPE(dbcsr_type), INTENT(IN) :: matrix_left, matrix_right
TYPE(dbcsr_type), INTENT(INOUT), TARGET :: product_matrix
!! DBCSR product matrix
LOGICAL, INTENT(IN), OPTIONAL :: retain_sparsity
!! retain the sparsity of the existing product matrix; default is no
REAL(kind=real_8), INTENT(IN), OPTIONAL :: filter_eps
INTEGER(KIND=int_8), INTENT(OUT) :: flop
!! effective flop
LOGICAL, INTENT(IN) :: keep_product_data
CHARACTER(len=*), PARAMETER :: routineN = 'multiply_3D'
INTEGER :: blk, data_type, data_type_byte, final_step_k, grp_left, grp_right, handle, &
handle1, handle2, icol3D, icol3D_send, ileft_buffer_calc, ileft_buffer_comm, &
iright_buffer_calc, iright_buffer_comm, irow3D, irow3D_send, istep_k_ordered, ithread, &
ivirt_k, last_step_k, left_col_mult, left_col_nimages, left_col_total_nimages, &
left_max_data_size, left_max_meta_size, left_myfirstvcol, left_myfirstvrow, left_mypcol, &
left_myprow, left_npcols, left_nprows, left_row_mult, left_row_nimages, &
leftovers_first_k, leftovers_k, leftovers_shift_k, leftovers_start_k, min_nimages, &
mycol3D, mypcol, myprow
INTEGER :: myrank3D, myrow3D, myt, nblkrows_local, nbuffers, nbuffers_norms, ncols3D, &
nranks3D, nrows3D, nthreads, numnodes, nvirt_k, proc3D_recv, proc3D_send, recv_vcol, &
recv_vrow, request_epss, request_keep_sparsity, right_col_mult, right_col_nimages, &
right_max_data_size, right_max_meta_size, right_myfirstvcol, right_myfirstvrow, &
right_mypcol, right_myprow, right_npcols, right_nprows, right_row_mult, &
right_row_nimages, right_row_total_nimages, row, shift3D, shift3D_comm, shift3D_recv, &
size_guess, size_guess_init, start_k, start_k_ordered, v_ki
INTEGER(KIND=int_8) :: mem
INTEGER, ALLOCATABLE, DIMENSION(:) :: left_vrow, product_matrix_epss_displ, &
product_matrix_epss_size, product_matrix_meta, product_matrix_size_recv, &
product_matrix_size_send, requests_reduction, right_vcol
INTEGER, ALLOCATABLE, DIMENSION(:, :) :: product_matrix_meta_displ, &
product_matrix_meta_size
INTEGER, DIMENSION(2) :: requests_reduction_size
INTEGER, DIMENSION(:), POINTER, CONTIGUOUS :: col_blk_sizes2enum, enum2col_blk_sizes, &
enum2row_blk_sizes, product_matrix_meta_recv, product_matrix_meta_send, &
row_blk_sizes2enum
INTEGER, DIMENSION(:), POINTER, CONTIGUOUS :: k_sizes
INTEGER, DIMENSION(:, :, :), POINTER, CONTIGUOUS :: left_displ_layers3D, &
left_images_size_layers3D, &
right_displ_layers3D, &
right_images_size_layers3D
INTEGER, DIMENSION(dbcsr_slot_nblkrows_total: &
dbcsr_slot_nfullcols_local) :: left_global_indices, right_global_indices
INTEGER, POINTER :: istep_k_comm
INTEGER, TARGET :: istep_k, istep_k_comm_curr
LOGICAL :: do_layers3D, do_square_layers3D, &
first_k, first_v_k, is_not_comm, &
keep_sparsity, otf_filtering
LOGICAL, ALLOCATABLE, DIMENSION(:) :: do_comm_left, do_comm_right
REAL(kind=sp) :: filter_eps_sp
REAL(kind=sp), ALLOCATABLE, DIMENSION(:), TARGET :: row_max_epss
REAL(kind=sp), ALLOCATABLE, DIMENSION(:, :) :: left_norms, right_norms
REAL(kind=sp), DIMENSION(:), POINTER, CONTIGUOUS :: product_matrix_epss
TYPE(dbcsr_2d_array_obj) :: product_matrix3D
TYPE(dbcsr_buffer), ALLOCATABLE, DIMENSION(:), &
TARGET :: left_buffers, right_buffers
TYPE(dbcsr_buffer), POINTER :: left_buffer_p, right_buffer_p
TYPE(dbcsr_data_obj) :: data_get, data_send
TYPE(dbcsr_mm_multrec_type_p), ALLOCATABLE, &
DIMENSION(:, :, :) :: multrec
TYPE(dbcsr_mp_obj) :: left_mp_obj, product_mp_obj, right_mp_obj
TYPE(mn_local_sizes), ALLOCATABLE, DIMENSION(:) :: m_sizes, n_sizes
CALL timeset(routineN, handle)
!
NULLIFY (row_blk_sizes2enum, enum2row_blk_sizes)
NULLIFY (col_blk_sizes2enum, enum2col_blk_sizes)
NULLIFY (k_sizes)
!
IF (PRESENT(retain_sparsity)) THEN
keep_sparsity = retain_sparsity
ELSE
keep_sparsity = .FALSE.
END IF
otf_filtering = PRESENT(filter_eps)
!
!$OMP PARALLEL DEFAULT (NONE) &
!$OMP SHARED (nthreads)
!$OMP MASTER
nthreads = 1
!$ nthreads = OMP_GET_NUM_THREADS()
!$OMP END MASTER
!$OMP END PARALLEL
!
! Dummy checks
IF (.NOT. ASSOCIATED(product_matrix%wms)) &
DBCSR_ABORT("Work matrices do not exist")
IF (SIZE(product_matrix%wms) .NE. nthreads) &
DBCSR_ABORT("Work matrices not correctly sized.")
IF (.NOT. buffers_win%left%is_valid .OR. &
.NOT. buffers_win%right%is_valid .OR. &
.NOT. ASSOCIATED(buffers_win%left%meta) .OR. &
.NOT. ASSOCIATED(buffers_win%right%meta) .OR. &
.NOT. ASSOCIATED(left_images_size) .OR. &
.NOT. ASSOCIATED(right_images_size) .OR. &
.NOT. ALLOCATED(left_local_images_size) .OR. &
.NOT. ALLOCATED(right_local_images_size)) &
DBCSR_ABORT("No buffers associated for the experimental algo!")
!
! Set up variables
flop = 0
data_type = dbcsr_get_data_type(product_matrix)
data_type_byte = dbcsr_datatype_sizeof(data_type)
left_row_nimages = imgdist_left%i%row_decimation
left_row_mult = imgdist_left%i%row_multiplicity
left_col_nimages = imgdist_left%i%col_decimation
left_col_mult = imgdist_left%i%col_multiplicity
right_row_nimages = imgdist_right%i%row_decimation
right_row_mult = imgdist_right%i%row_multiplicity
right_col_nimages = imgdist_right%i%col_decimation
right_col_mult = imgdist_right%i%col_multiplicity
left_mp_obj = dbcsr_distribution_mp(imgdist_left%i%main)
right_mp_obj = dbcsr_distribution_mp(imgdist_right%i%main)
product_mp_obj = dbcsr_distribution_mp(product_matrix%dist)
numnodes = dbcsr_mp_numnodes(product_mp_obj)
myprow = dbcsr_mp_myprow(product_mp_obj)
mypcol = dbcsr_mp_mypcol(product_mp_obj)
left_nprows = dbcsr_mp_nprows(left_mp_obj)
left_npcols = dbcsr_mp_npcols(left_mp_obj)
left_myprow = dbcsr_mp_myprow(left_mp_obj)
left_mypcol = dbcsr_mp_mypcol(left_mp_obj)
left_myfirstvrow = MOD(left_myprow, layers_3D_C_reduction%side3D)*left_row_nimages
left_myfirstvcol = MOD(left_mypcol, layers_3D_C_reduction%side3D)*left_col_nimages
right_nprows = dbcsr_mp_nprows(right_mp_obj)
right_npcols = dbcsr_mp_npcols(right_mp_obj)
right_myprow = dbcsr_mp_myprow(right_mp_obj)
right_mypcol = dbcsr_mp_mypcol(right_mp_obj)
right_myfirstvrow = MOD(right_myprow, layers_3D_C_reduction%side3D)*right_row_nimages
right_myfirstvcol = MOD(right_mypcol, layers_3D_C_reduction%side3D)*right_col_nimages
left_col_total_nimages = left_npcols*left_col_nimages
right_row_total_nimages = right_nprows*right_row_nimages
grp_right = buffers_win%right%subgrp
grp_left = buffers_win%left%subgrp
!
do_layers3D = layers_3D_C_reduction%num_layers_3D .GT. 1
myrow3D = myprow/layers_3D_C_reduction%side3D + 1
mycol3D = mypcol/layers_3D_C_reduction%side3D + 1
nrows3D = SIZE(left_images_size, 3)
ncols3D = SIZE(right_images_size, 3)
myrank3D = get_rank3D(myprow, mypcol, dbcsr_mp_nprows(product_mp_obj), layers_3D_C_reduction%side3D)
nranks3D = layers_3D_C_reduction%num_layers_3D
myprow = MOD(myprow, layers_3D_C_reduction%side3D)
mypcol = MOD(mypcol, layers_3D_C_reduction%side3D)
!
! Dummy checks
! subcommunicators
IF (.NOT. dbcsr_mp_has_subgroups(right_mp_obj)) &
DBCSR_ABORT("Experimental algorithm requires rows subcommunicators for right matrix!")
IF (.NOT. dbcsr_mp_has_subgroups(left_mp_obj)) &
DBCSR_ABORT("Experimental algorithm requires columns subcommunicators for left matrix!")
! Right col nimages
IF (right_col_nimages .NE. 1) &
DBCSR_ABORT("Col nimages for right matrix is not 1!")
! Left row nimages
IF (left_row_nimages .NE. 1) &
DBCSR_ABORT("Row nimages for left matrix is not 1!")
! left/right matching
IF (left_col_nimages .NE. right_row_mult) &
DBCSR_ABORT("Left/Right image mismatch")
IF (left_col_mult .NE. right_row_nimages) &
DBCSR_ABORT("Left/Right image mismatch")
IF (left_col_nimages*left_npcols .NE. right_row_nimages*right_nprows) &
DBCSR_ABORT("Left/Right total mismatch")
! product/left matching
IF (left_row_mult*dbcsr_mp_nprows(product_mp_obj) .NE. left_nprows) &
DBCSR_ABORT("Product/Left total mismatch")
! product/left matching
IF (right_col_mult*dbcsr_mp_npcols(product_mp_obj) .NE. right_npcols) &
DBCSR_ABORT("Product/Right total mismatch")
! Check sizes from make_buffers
IF (SIZE(left_images_size, 2) .NE. left_col_nimages .OR. &
SIZE(right_images_size, 2) .NE. right_row_nimages) &
DBCSR_ABORT("Mismatch in the sizes")
!
dbcsr_mpi_statistics%nimages = MAX(dbcsr_mpi_statistics%nimages, left_col_nimages)
dbcsr_mpi_statistics%nimages = MAX(dbcsr_mpi_statistics%nimages, right_row_nimages)
!
! The main transfer loop goes through the virtual rows/columns.
! The number of steps may be smaller if the grid dimension is very
! non-optimal (both left column images and right row images are >
! 1).
min_nimages = MIN(left_col_nimages, right_row_nimages)
nvirt_k = left_npcols*left_col_nimages
!
! Check RMA windows creation for original data
CALL win_setup(buffers_win%left, do_win_create_left, requests_win_create(1))
CALL win_setup(buffers_win%right, do_win_create_right, requests_win_create(2))
!
! Count the maximum possible multiplies per row for on-the-fly filtering
ALLOCATE (product_matrix_epss_size(nrows3D), product_matrix_epss_displ(nrows3D))
IF (otf_filtering) THEN
! Wait for counts (sent in make_buffers)
CALL timeset(routineN//"_count_rows", handle1)
CALL mp_wait(request_count_rows)
!
nblkrows_local = SIZE(left_total_row_counts)
ALLOCATE (row_max_epss(nblkrows_local))
filter_eps_sp = REAL(filter_eps, KIND=KIND(row_max_epss))
!$OMP PARALLEL DO DEFAULT (NONE) &
!$OMP SHARED(nblkrows_local,row_max_epss,filter_eps_sp,&
!$OMP left_total_row_counts)
! Determine the maximum per-block epsilon
DO row = 1, nblkrows_local
row_max_epss(row) = &
(filter_eps_sp/REAL(MAX(1, left_total_row_counts(row)), KIND=KIND(row_max_epss)))**2
END DO
!$OMP END PARALLEL DO
DEALLOCATE (left_total_row_counts)
!
IF (do_layers3D .AND. nrows3D .GT. 1) THEN
CALL mp_allgather(SIZE(row_max_epss), &
product_matrix_epss_size, &
layers_3D_C_reduction%rowgrp3D)
size_guess = 0
DO irow3D = 1, nrows3D
product_matrix_epss_displ(irow3D) = size_guess
size_guess = size_guess + product_matrix_epss_size(irow3D)
END DO
ALLOCATE (product_matrix_epss(size_guess))
CALL mp_iallgather(row_max_epss, &
product_matrix_epss, product_matrix_epss_size, product_matrix_epss_displ, &
layers_3D_C_reduction%rowgrp3D, request_epss)
ELSE
product_matrix_epss_size(nrows3D) = SIZE(row_max_epss)
product_matrix_epss_displ(nrows3D) = 0
product_matrix_epss => row_max_epss
END IF
CALL timestop(handle1)
ELSE
product_matrix_epss_size(:) = 0
product_matrix_epss_displ(:) = 0
ALLOCATE (product_matrix_epss(0))
END IF
!
! Exchange 3D meta for C matrix
IF (do_layers3D .AND. keep_sparsity) THEN
ALLOCATE (product_matrix_meta_size(nrows3D, ncols3D))
CALL mp_allgather(product_matrix%index(dbcsr_slot_size), &
product_matrix_meta_size, layers_3D_C_reduction%grp3D)
ALLOCATE (product_matrix_meta_displ(nrows3D, ncols3D))
size_guess = 0
DO icol3D = 1, ncols3D
DO irow3D = 1, nrows3D
product_matrix_meta_displ(irow3D, icol3D) = size_guess
size_guess = size_guess + product_matrix_meta_size(irow3D, icol3D)
END DO
END DO
ALLOCATE (product_matrix_meta(size_guess))
product_matrix%index(dbcsr_slot_nblks) = product_matrix%nblks
product_matrix%index(dbcsr_slot_nze) = product_matrix%nze
CALL mp_iallgather(product_matrix%index(1:product_matrix%index(dbcsr_slot_size)), &
product_matrix_meta, product_matrix_meta_size, product_matrix_meta_displ, &
layers_3D_C_reduction%grp3D, request_keep_sparsity)
END IF
!
! Wait refs and max norms (sent in make_buffers)
CALL timeset(routineN//"_sizes", handle1)
CALL mp_waitall(requests)
CALL timestop(handle1)
DEALLOCATE (right_local_images_size, left_local_images_size)
!
! Needs to remap refs for virtual coordinates 3D
CALL remap_layers3D(left_images_size, left_images_size_layers3D, left_displ_layers3D, &
left_max_data_size, left_max_meta_size)
CALL remap_layers3D(right_images_size, right_images_size_layers3D, right_displ_layers3D, &
right_max_data_size, right_max_meta_size)
left_max_meta_size = left_max_meta_size + dbcsr_num_slots
right_max_meta_size = right_max_meta_size + dbcsr_num_slots
!
do_square_layers3D = .FALSE.
nbuffers_norms = 1
IF (nvirt_k .EQ. 1) THEN
nbuffers = 1
ELSEIF (nrows3D .NE. ncols3D .OR. nranks3D .EQ. 1) THEN
nbuffers = 2
ELSE
! Note that nrows3D==ncols3D >= 2
! Last buffer is used as temporary for communications
nbuffers = nrows3D + 1
nbuffers_norms = nrows3D
do_square_layers3D = .TRUE.
END IF
!
! update capacity of memory-pools
IF (ASSOCIATED(memtype_abpanel_1%pool)) &
CALL dbcsr_mempool_limit_capacity(memtype_abpanel_1%pool, &
capacity=2)
IF (ASSOCIATED(memtype_abpanel_2%pool)) &
CALL dbcsr_mempool_limit_capacity(memtype_abpanel_2%pool, &
capacity=2)
IF (has_acc) THEN
! enumerate the blocksizes to keep the following 2D-arrays small.
CALL enumerate_blk_sizes(matrix_right%row_blk_size%low%data, &
dbcsr_max_row_size(matrix_right), &
row_blk_sizes2enum, enum2row_blk_sizes)
CALL enumerate_blk_sizes(matrix_right%col_blk_size%low%data, &
dbcsr_max_col_size(matrix_right), &
col_blk_sizes2enum, enum2col_blk_sizes)
END IF
IF (nranks3D .GT. 1) THEN
CALL dbcsr_mempool_limit_capacity(memtype_mpi_product%pool, &
capacity=nranks3D - 1)
END IF
!
! Prepare buffers for computation
IF (nvirt_k .GT. 1) THEN
! Right
CALL buffer_init(buffers_2%right, data_type, &
right_max_data_size, &
right_max_meta_size, &
num_data=(nbuffers/2), &
data_memory_type=memtype_abpanel_2, &
trs_memory_type=memtype_trsbuffer_2)
! Left
CALL buffer_init(buffers_2%left, data_type, &
left_max_data_size, &
left_max_meta_size, &
num_data=(nbuffers/2), &
data_memory_type=memtype_abpanel_2)
END IF
!
! Prepare buffers for communication
! Right
CALL buffer_init(buffers_1%right, data_type, &
right_max_data_size, &
right_max_meta_size, &
num_data=(nbuffers - nbuffers/2), &
data_memory_type=memtype_abpanel_1, &
trs_memory_type=memtype_trsbuffer_1)
! Left
CALL buffer_init(buffers_1%left, data_type, &
left_max_data_size, &
left_max_meta_size, &
num_data=(nbuffers - nbuffers/2), &
data_memory_type=memtype_abpanel_1)
!
CALL setup_buffers(buffers_1%right, buffers_2%right, &
right_buffers, nbuffers, &
right_max_data_size, &
right_max_meta_size, &
matrix_right, imgdist_right)
CALL setup_buffers(buffers_1%left, buffers_2%left, &
left_buffers, nbuffers, &
left_max_data_size, &
left_max_meta_size, &
matrix_left, imgdist_left)
!
! Setup the receive data pointers
CALL dbcsr_data_init(data_get)
CALL dbcsr_data_new(data_get, data_type)
IF (do_layers3D) THEN
CALL dbcsr_data_init(data_send)
CALL dbcsr_data_new(data_send, data_type)
! Prepare buffers for 3D reduction
IF (ALLOCATED(layers_3D_C_reduction%data_red3D)) THEN
IF (SIZE(layers_3D_C_reduction%data_red3D) .LT. nthreads .OR. &
layers_3D_C_reduction%data_type .NE. data_type) THEN
DO myt = 1, SIZE(layers_3D_C_reduction%data_red3D)
CALL dbcsr_data_release(layers_3D_C_reduction%data_red3D(myt))
END DO
DEALLOCATE (layers_3D_C_reduction%data_red3D)
layers_3D_C_reduction%data_type = 0
END IF
END IF
IF (.NOT. ALLOCATED(layers_3D_C_reduction%data_red3D)) THEN
ALLOCATE (layers_3D_C_reduction%data_red3D(nthreads))
DO myt = 1, nthreads
CALL dbcsr_data_init(layers_3D_C_reduction%data_red3D(myt))
CALL dbcsr_data_new(layers_3D_C_reduction%data_red3D(myt), data_type)
END DO
layers_3D_C_reduction%data_type = data_type
END IF
ALLOCATE (product_matrix_size_send(nthreads + 1), product_matrix_size_recv(nthreads + 1))
ALLOCATE (requests_reduction((nthreads + 1)*2))
END IF
!
! These values for meta data are used for global values
right_global_indices(dbcsr_slot_nblkrows_total:dbcsr_slot_nfullcols_local) = &
(/ &
dbcsr_nblkrows_total(matrix_right), &
dbcsr_nblkcols_total(matrix_right), &
dbcsr_nfullrows_total(matrix_right), &
dbcsr_nfullcols_total(matrix_right), &
0, 0, &
dbcsr_nfullrows_local(matrix_right), &
dbcsr_nfullcols_local(matrix_right)/)
left_global_indices(dbcsr_slot_nblkrows_total:dbcsr_slot_nfullcols_local) = &
(/ &
dbcsr_nblkrows_total(matrix_left), &
dbcsr_nblkcols_total(matrix_left), &
dbcsr_nfullrows_total(matrix_left), &
dbcsr_nfullcols_total(matrix_left), &
0, 0, &
dbcsr_nfullrows_local(matrix_left), &
dbcsr_nfullcols_local(matrix_left)/)
!
! Evaluate sizes for workspaces
size_guess_init = 1
IF (.NOT. keep_sparsity .AND. has_acc) THEN
size_guess_init = product_matrix_size_guess(matrix_left, matrix_right, product_matrix, &
left_max_data_size, right_max_data_size, &
left_col_nimages, right_row_nimages, &
nthreads)
END IF
!
! Preallocate norms arrays
IF (otf_filtering) THEN
ALLOCATE (right_norms(right_max_meta_size/3, nbuffers_norms))
ALLOCATE (left_norms(left_max_meta_size/3, nbuffers_norms))
IF (do_layers3D .AND. nrows3D .GT. 1) THEN
CALL mp_wait(request_epss)
DEALLOCATE (row_max_epss)
END IF
ELSE
! The array must be valid when passed to called subroutines.
ALLOCATE (right_norms(0, nbuffers_norms))
ALLOCATE (left_norms(0, nbuffers_norms))
END IF
!
IF (do_layers3D .AND. keep_sparsity) CALL mp_wait(request_keep_sparsity)
!
ALLOCATE (product_matrix3D%mats(nrows3D, ncols3D))
DO icol3D = 1, ncols3D
DO irow3D = 1, nrows3D
NULLIFY (product_matrix3D%mats(irow3D, icol3D)%matrix)
END DO
END DO
ALLOCATE (multrec(0:nthreads - 1, nrows3D, ncols3D))
!
! Here is the main loop
! 3D multiplication
!
CALL timeset(routineN//"_loop", handle1)
! Take into account when ticks are not multiple of 3D layers
leftovers_k = MOD(nvirt_k, nranks3D)
leftovers_first_k = leftovers_k*myrank3D
leftovers_start_k = 0
leftovers_shift_k = 0
IF (leftovers_k .GT. 0) THEN
! This is only for nrows3D==ncols3D
leftovers_start_k = (nvirt_k/nrows3D - 1)*(myrank3D/nrows3D) - &
(leftovers_k/nrows3D - 1)*(myrank3D/nrows3D)
leftovers_shift_k = nranks3D*(leftovers_k/nrows3D) - leftovers_k*(MOD(myrank3D, nrows3D) + 1)
END IF
! Ticks bounds
start_k = (nvirt_k/nranks3D)*myrank3D
last_step_k = nvirt_k + leftovers_first_k
final_step_k = last_step_k - nranks3D
! Shift layers to keep local layer as the last one in computation
shift3D = (mycol3D - 1)*nrows3D + &
(nrows3D - myrow3D + 1)*(1 - MOD(mycol3D, 2)) + myrow3D*MOD(mycol3D, 2)
iright_buffer_comm = 0
ileft_buffer_comm = 0
ALLOCATE (do_comm_right(ncols3D), do_comm_left(nrows3D))
ALLOCATE (right_vcol(ncols3D), left_vrow(nrows3D))
ALLOCATE (m_sizes(nrows3D), n_sizes(ncols3D))
irow3D_send = 0
icol3D_send = 0
first_k = .TRUE.
first_v_k = .TRUE.
istep_k_comm_curr = leftovers_first_k
istep_k_comm => istep_k_comm_curr
grouped_steps_index: DO istep_k = leftovers_first_k, last_step_k
!
! Wait data. Exclude the first iteration.
wait: IF (istep_k .GT. leftovers_first_k) THEN
IF (debug_mod) WRITE (*, '(1X,A)') routineN//" waiting for right and left"
right_buffer_p => right_buffers(iright_buffer_calc)
left_buffer_p => left_buffers(ileft_buffer_calc)
IF (right_buffer_p%is_comm .AND. left_buffer_p%is_comm) THEN
! check if right matrix was already initialized
IF (.NOT. right_buffer_p%matrix%valid) THEN
CALL timeset(routineN//"_comm_right", handle2)
CALL mp_waitall(right_buffer_p%get_requests(:))
CALL timestop(handle2)
END IF
! check if left matrix was already initialized
IF (.NOT. left_buffer_p%matrix%valid) THEN
CALL timeset(routineN//"_comm_left", handle2)
CALL mp_waitall(left_buffer_p%get_requests(:))
CALL timestop(handle2)
END IF
END IF
END IF wait
!
! Matrix transfer. Transfer in all but the last loop iteration.
shift3D_comm = shift3D
xfer: DO WHILE (istep_k_comm .LT. last_step_k)
start_k_ordered = start_k
istep_k_ordered = istep_k_comm
! Put leftovers ticks always first
IF (leftovers_k .GT. 0) THEN
IF (istep_k_comm .LT. leftovers_first_k + leftovers_k) THEN
start_k_ordered = leftovers_start_k
ELSE
istep_k_ordered = istep_k_comm + leftovers_shift_k
END IF
END IF
first_k = MOD(istep_k_ordered, nranks3D) .EQ. 0
ivirt_k = istep_k_ordered/nranks3D
IF (istep_k_comm .LT. leftovers_first_k + leftovers_k) THEN
CALL row_col_3D_reflected(irow3D, icol3D, nrows3D, ncols3D, istep_k_ordered)
ELSE
CALL row_col_3D_reflected(irow3D, icol3D, nrows3D, ncols3D, shift3D)
shift3D = shift3D + 1
END IF
!
v_ki = MOD(ivirt_k, min_nimages)
! Reset communication flags at the first layer
IF ((first_k .OR. istep_k_comm .EQ. leftovers_first_k) .AND. &
istep_k_comm .EQ. istep_k_comm_curr) THEN
do_comm_right(:) = .TRUE.
do_comm_left(:) = .TRUE.
END IF
! Take first image global virtual coordinates
IF (v_ki .EQ. 0) THEN
IF (istep_k_comm .GE. leftovers_first_k + leftovers_k) first_v_k = .FALSE.
start_k_ordered = start_k_ordered + ivirt_k
END IF
IF (v_ki .EQ. 0 .OR. (first_v_k .AND. min_nimages .GT. 1)) THEN
CALL image_calculator(imgdist_right, &
vprow=recv_vrow, &
vpcol=right_vcol(icol3D), &
mypcol=mypcol, &
myvprow=right_myfirstvrow, &
myvpcol=right_myfirstvcol + (icol3D - 1)*layers_3D_C_reduction%side3D, &
vprow_shift=start_k_ordered, &
shifting='R')
CALL image_calculator(imgdist_left, &
vprow=left_vrow(irow3D), &
vpcol=recv_vcol, &
myprow=myprow, &
myvprow=left_myfirstvrow + (irow3D - 1)*layers_3D_C_reduction%side3D, &
myvpcol=left_myfirstvcol, &
vpcol_shift=start_k_ordered, &
shifting='L')
END IF
!
! Set coordinates
IF (do_square_layers3D) THEN
! Use the temporary buffers for the communication of the first tick
IF (first_k) THEN
iright_buffer_comm = nbuffers
ileft_buffer_comm = nbuffers
ELSE
iright_buffer_comm = icol3D
ileft_buffer_comm = irow3D
END IF
ELSE
IF (do_comm_right(icol3D)) THEN
iright_buffer_comm = MOD(iright_buffer_comm, nbuffers) + 1
END IF
IF (do_comm_left(irow3D)) THEN
ileft_buffer_comm = MOD(ileft_buffer_comm, nbuffers) + 1
END IF
END IF
!
! Exit if data are already communicated
IF (istep_k_comm .NE. istep_k_comm_curr) EXIT
!
right_buffer_p => right_buffers(iright_buffer_comm)
left_buffer_p => left_buffers(ileft_buffer_comm)
right_buffer_p%coord3D = icol3D
left_buffer_p%coord3D = irow3D
!
! First row, communicate right matrix
IF (do_comm_right(icol3D)) THEN
right_buffer_p%vprow = MOD(recv_vrow + v_ki, right_row_total_nimages)
right_buffer_p%vpcol = right_vcol(icol3D)
right_buffer_p%is_comm = .FALSE.
END IF
!
is_not_comm = .TRUE.
IF (right_images_size_layers3D(imeta, icol3D, right_buffer_p%vprow) .NE. 0) THEN
! First col, communicate left matrix
IF (do_comm_left(irow3D)) THEN
left_buffer_p%vprow = left_vrow(irow3D)
left_buffer_p%vpcol = MOD(recv_vcol + v_ki, left_col_total_nimages)
left_buffer_p%is_comm = .FALSE.
END IF
!
IF (left_images_size_layers3D(imeta, irow3D, left_buffer_p%vpcol) .NE. 0) THEN
! Check if data is already communicated
is_not_comm = do_comm_right(icol3D) .OR. do_comm_left(irow3D)
IF (is_not_comm) THEN
! Right
IF (do_comm_right(icol3D)) THEN
IF (has_acc) THEN
CALL timeset(routineN//"_acc_sync_right", handle2)
CALL acc_event_synchronize(right_buffer_p%data%d%acc_ready)
CALL timestop(handle2)
END IF
!
do_comm_right(icol3D) = .FALSE.
CALL rma_transfer(right_buffer_p%vprow, right_row_nimages, &
right_images_size_layers3D(:, icol3D, right_buffer_p%vprow), &
right_displ_layers3D(:, icol3D, right_buffer_p%vprow), &
right_buffer_p, &
buffers_win%right%meta_win, buffers_win%right%data_win, &
data_get, data_type_byte, buffers_win%right, icol3D, ncols3D)
END IF
! Left
IF (do_comm_left(irow3D)) THEN
IF (has_acc) THEN
CALL timeset(routineN//"_acc_sync_left", handle2)
CALL acc_event_synchronize(left_buffer_p%data%d%acc_ready)
CALL timestop(handle2)
END IF
!
do_comm_left(irow3D) = .FALSE.
CALL rma_transfer(left_buffer_p%vpcol, left_col_nimages, &
left_images_size_layers3D(:, irow3D, left_buffer_p%vpcol), &
left_displ_layers3D(:, irow3D, left_buffer_p%vpcol), &
left_buffer_p, &
buffers_win%left%meta_win, buffers_win%left%data_win, &
data_get, data_type_byte, buffers_win%left, irow3D, nrows3D)
END IF
END IF
END IF
END IF
!
istep_k_comm_curr = istep_k_comm_curr + 1
! Stop looping when data is communicated
! Only works for 4 layers
IF (is_not_comm .OR. nranks3D .NE. 4) THEN
istep_k_comm => istep_k
IF ((istep_k_comm + 1) .EQ. istep_k_comm_curr) EXIT
! Restore coordinates by looping once again
shift3D = shift3D_comm
CYCLE
END IF
! Keep looping until it starts a new communication (only for 4 layers)
istep_k_comm => istep_k_comm_curr
END DO xfer
!
! Create matrices and multrec's, only the first occurrence
IF (.NOT. ASSOCIATED(product_matrix3D%mats(irow3D, icol3D)%matrix)) THEN
IF (irow3D .EQ. myrow3D .AND. icol3D .EQ. mycol3D) THEN
product_matrix3D%mats(irow3D, icol3D)%matrix => product_matrix
ELSE
ALLOCATE (product_matrix3D%mats(irow3D, icol3D)%matrix)
IF (keep_sparsity) THEN
size_guess = product_matrix_meta(product_matrix_meta_displ(irow3D, icol3D) + &
dbcsr_slot_nze)
CALL setup_buffer_matrix(product_matrix3D%mats(irow3D, icol3D)%matrix, &
product_matrix, product_matrix_meta_size(irow3D, icol3D), &
data_size=size_guess, &
data_memory_type=memtype_mpi_product)
product_matrix3D%mats(irow3D, icol3D)% &
matrix%index(1:product_matrix_meta_size(irow3D, icol3D)) = &
product_matrix_meta(product_matrix_meta_displ(irow3D, icol3D) + 1: &
product_matrix_meta_displ(irow3D, icol3D) + &
product_matrix_meta_size(irow3D, icol3D))
CALL dbcsr_data_clear(product_matrix3D%mats(irow3D, icol3D)%matrix%data_area, &
ub=size_guess)
ELSE
CALL setup_buffer_matrix(product_matrix3D%mats(irow3D, icol3D)%matrix, &
product_matrix, data_memory_type=memtype_mpi_product)
END IF
product_matrix3D%mats(irow3D, icol3D)%matrix%index(dbcsr_slot_home_prow) = &
(irow3D - 1)*layers_3D_C_reduction%side3D + myprow
product_matrix3D%mats(irow3D, icol3D)%matrix%index(dbcsr_slot_home_pcol) = &
(icol3D - 1)*layers_3D_C_reduction%side3D + mypcol
CALL dbcsr_reset_locals(product_matrix3D%mats(irow3D, icol3D)%matrix)
product_matrix3D%mats(irow3D, icol3D)%matrix%nblks = 0
CALL dbcsr_repoint_index(product_matrix3D%mats(irow3D, icol3D)%matrix)
END IF
!
IF (.NOT. ASSOCIATED(m_sizes(irow3D)%sizes)) THEN
ALLOCATE (m_sizes(irow3D)%sizes(dbcsr_nblkrows_local(product_matrix3D%mats(irow3D, icol3D)%matrix)))
CALL local_filter(array_data(product_matrix3D%mats(irow3D, icol3D)%matrix%row_blk_size), &
array_size(product_matrix3D%mats(irow3D, icol3D)%matrix%local_rows), &
array_data(product_matrix3D%mats(irow3D, icol3D)%matrix%local_rows), &
m_sizes(irow3D)%sizes)
END IF
IF (.NOT. ASSOCIATED(n_sizes(icol3D)%sizes)) THEN
ALLOCATE (n_sizes(icol3D)%sizes(dbcsr_nblkcols_local(product_matrix3D%mats(irow3D, icol3D)%matrix)))
CALL local_filter(array_data(product_matrix3D%mats(irow3D, icol3D)%matrix%col_blk_size), &
array_size(product_matrix3D%mats(irow3D, icol3D)%matrix%local_cols), &
array_data(product_matrix3D%mats(irow3D, icol3D)%matrix%local_cols), &
n_sizes(icol3D)%sizes)
END IF
!
!$OMP PARALLEL DEFAULT(NONE) &
!$OMP PRIVATE (size_guess, ithread) &
!$OMP SHARED (product_matrix3D, multrec, &
!$OMP keep_sparsity, filter_eps, &
!$OMP product_matrix_epss, &
!$OMP matrix_right, matrix_left, nthreads, &
!$OMP irow3D, icol3D, myrow3D, mycol3D, keep_product_data, &
!$OMP product_matrix_epss_displ, product_matrix_epss_size, &
!$OMP memtype_product_wm, size_guess_init, nranks3D, m_sizes, n_sizes)
!
! Setup product work areas
!
ithread = 0
!$ ithread = OMP_GET_THREAD_NUM()
!
IF (irow3D .NE. myrow3D .OR. icol3D .NE. mycol3D) THEN
IF (keep_product_data) THEN
CALL dbcsr_add_wm_from_matrix(product_matrix3D%mats(irow3D, icol3D)%matrix)
ELSE
CALL dbcsr_work_create(product_matrix3D%mats(irow3D, icol3D)%matrix, &
work_mutable=.FALSE., memory_type=memtype_product_wm(ithread)%p)
END IF
!$OMP BARRIER
END IF
! The work arrays have to be setup
size_guess = product_matrix3D%mats(irow3D, icol3D)%matrix%wms(ithread + 1)%datasize ! Should be minimal
IF (.NOT. keep_sparsity) THEN
size_guess = MAX(size_guess, size_guess_init)
END IF
CALL dbcsr_data_ensure_size(product_matrix3D%mats(irow3D, icol3D)% &
matrix%wms(ithread + 1)%data_area, &
size_guess)
CALL dbcsr_data_set_size_referenced(product_matrix3D%mats(irow3D, icol3D)% &
matrix%wms(ithread + 1)%data_area, &
product_matrix3D%mats(irow3D, icol3D)% &
matrix%wms(ithread + 1)%datasize)
CALL ensure_array_size(product_matrix3D%mats(irow3D, icol3D)% &
matrix%wms(ithread + 1)%row_i, ub=1)
CALL ensure_array_size(product_matrix3D%mats(irow3D, icol3D)% &
matrix%wms(ithread + 1)%col_i, ub=1)
CALL ensure_array_size(product_matrix3D%mats(irow3D, icol3D)% &
matrix%wms(ithread + 1)%blk_p, ub=1)
ALLOCATE (multrec(ithread, irow3D, icol3D)%p)
CALL dbcsr_mm_multrec_init(multrec(ithread, irow3D, icol3D)%p, &
product=product_matrix3D%mats(irow3D, icol3D)%matrix, &
keep_sparsity=keep_sparsity, &
eps=filter_eps, &
row_max_epss=product_matrix_epss(product_matrix_epss_displ(irow3D) + 1: &
product_matrix_epss_displ(irow3D) + &
product_matrix_epss_size(irow3D)), &
block_estimate=0, &
right_row_blk_size=dbcsr_row_block_sizes(matrix_right), &
m_sizes=m_sizes(irow3D)%sizes, n_sizes=n_sizes(icol3D)%sizes, &
nlayers=nranks3D, &
keep_product_data=keep_product_data)
!$OMP END PARALLEL
!
product_matrix3D%mats(irow3D, icol3D)%matrix%nblks = 0
product_matrix3D%mats(irow3D, icol3D)%matrix%nze = 0
product_matrix3D%mats(irow3D, icol3D)%matrix%row_p(:) = 0
CALL dbcsr_data_set_size_referenced(product_matrix3D%mats(irow3D, icol3D)%matrix%data_area, 0)
product_matrix3D%mats(irow3D, icol3D)%matrix%valid = .FALSE.
END IF
!
! Do the multiplication. Exclude the first iteration.
calc: IF (istep_k .GT. leftovers_first_k) THEN
right_buffer_p => right_buffers(iright_buffer_calc)
left_buffer_p => left_buffers(ileft_buffer_calc)
irow3D = left_buffer_p%coord3D
icol3D = right_buffer_p%coord3D
IF (istep_k .GT. final_step_k) THEN
!$OMP PARALLEL DEFAULT (NONE) &
!$OMP SHARED (multrec, irow3D, icol3D, irow3D_send, icol3D_send, &
!$OMP istep_k, final_step_k, product_matrix3D, &
!$OMP handle2, requests_reduction_size, nthreads, &
!$OMP product_matrix_meta_send, product_matrix_meta_recv, &
!$OMP product_matrix_size_send, product_matrix_size_recv, &
!$OMP buffers_win, data_send, data_get, proc3D_send, proc3D_recv, &
!$OMP layers_3D_C_reduction, requests_reduction, &
!$OMP dbcsr_mpi_statistics, data_type_byte) &
!$OMP PRIVATE (ithread)
ithread = 0
!$ ithread = omp_get_thread_num()
! Prepare data to send for 3D layer
IF (istep_k .GT. final_step_k + 1) THEN
CALL dbcsr_mm_multrec_finalize( &
multrec(ithread, irow3D_send, icol3D_send)%p, &
buffers_win%left%meta_red3D)
!$OMP BARRIER
!$OMP MASTER
CALL timeset(routineN//"_red3D_size", handle2)
CALL mp_waitall(requests_reduction_size)
CALL timestop(handle2)
CALL ensure_array_size(buffers_win%right%meta_red3D, &
ub=product_matrix_size_recv(1), &
nocopy=.TRUE.)
product_matrix_meta_send => &
buffers_win%left%meta_red3D(1:product_matrix_size_send(1))
product_matrix_meta_recv => &
buffers_win%right%meta_red3D(1:product_matrix_size_recv(1))
CALL mp_isendrecv(product_matrix_meta_send, proc3D_send, &
product_matrix_meta_recv, proc3D_recv, &
layers_3D_C_reduction%grp3D, &
requests_reduction(1), requests_reduction(2))
DO myt = 1, nthreads
CALL dbcsr_data_ensure_size(layers_3D_C_reduction%data_red3D(myt), &
product_matrix_size_recv(myt + 1), &
nocopy=.TRUE.)
CALL dbcsr_data_set_pointer( &
area=data_send, &
rsize=product_matrix_size_send(myt + 1), &
csize=1, &
pointee=product_matrix3D%mats(irow3D_send, icol3D_send)%matrix%wms(myt)%data_area)
CALL dbcsr_data_set_pointer( &
area=data_get, &
rsize=product_matrix_size_recv(myt + 1), &
csize=1, &
pointee=layers_3D_C_reduction%data_red3D(myt))
CALL dbcsr_isendrecv_any(data_send, proc3D_send, &
data_get, proc3D_recv, &
layers_3D_C_reduction%grp3D, &
requests_reduction(3 + (myt - 1)*2), &
requests_reduction(4 + (myt - 1)*2))
CALL count_mpi_statistics(dbcsr_mpi_statistics%data_size(1, :), &
product_matrix_size_send(myt + 1), &
data_type_byte, &
dbcsr_mpi_statistics%data_size_breakdown(:, :, 1))
END DO
!$OMP END MASTER
END IF
!$OMP END PARALLEL
END IF
!
IF (right_buffer_p%is_comm .AND. left_buffer_p%is_comm) THEN
iright_buffer_calc = MIN(iright_buffer_calc, nbuffers_norms)
ileft_buffer_calc = MIN(ileft_buffer_calc, nbuffers_norms)
! check if right matrix was already initialized
IF (.NOT. right_buffer_p%matrix%valid) THEN
IF (has_acc) CALL dbcsr_data_host2dev(right_buffer_p%data)
! Repoint indices of matrices
CALL make_meta(right_buffer_p, &
right_row_total_nimages, &
right_buffer_p%vprow, &
right_buffer_p%vpcol, &
imgdist=imgdist_right, do_merge_rows=.FALSE., &
global_indices=right_global_indices)
CALL ensure_array_size(k_sizes, ub=array_size(right_buffer_p%matrix%local_rows))
CALL local_filter(array_data(right_buffer_p%matrix%row_blk_size), &
array_size(right_buffer_p%matrix%local_rows), &
array_data(right_buffer_p%matrix%local_rows), &
k_sizes)
IF (otf_filtering) THEN
CALL calculate_norms(right_buffer_p%matrix, &
right_norms(:, iright_buffer_calc), &
k_sizes, n_sizes(icol3D)%sizes)
END IF
IF (has_acc) THEN
CALL acc_transpose_blocks(right_buffer_p%matrix, &
right_buffer_p%trs_stackbuf, &
k_sizes, n_sizes(icol3D)%sizes, &
row_blk_sizes2enum, enum2row_blk_sizes, &
col_blk_sizes2enum, enum2col_blk_sizes, &
noresize=.TRUE.)
END IF
END IF
! check if left matrix was already initialized
IF (.NOT. left_buffer_p%matrix%valid) THEN
IF (has_acc) CALL dbcsr_data_host2dev(left_buffer_p%data)
! Repoint indices of matrices
CALL make_meta(left_buffer_p, &
left_col_total_nimages, &
left_buffer_p%vprow, &
left_buffer_p%vpcol, &
imgdist=imgdist_left, do_merge_rows=.TRUE., &
global_indices=left_global_indices, &
nthreads=nthreads)
IF (otf_filtering) THEN
CALL calculate_norms(left_buffer_p%matrix, &
left_norms(:, ileft_buffer_calc), &
m_sizes(irow3D)%sizes, k_sizes)
END IF
END IF
! Wait for left and right buffers transfer to device before proceeding
IF (has_acc) THEN
CALL timeset(routineN//"_sync_h2d", handle2)
CALL acc_device_synchronize()
CALL timestop(handle2)
END IF
!
CALL timeset(routineN//"_multrec", handle2)
!$OMP PARALLEL DEFAULT (NONE) &
!$OMP SHARED (left_buffer_p, ileft_buffer_calc, &
!$OMP right_buffer_p, iright_buffer_calc, &
!$OMP left_norms,right_norms, &
!$OMP multrec, irow3D, icol3D, handle2, k_sizes) &
!$OMP PRIVATE (ithread) &
!$OMP REDUCTION (+: flop)
ithread = 0
!$ ithread = omp_get_thread_num()
CALL dbcsr_mm_multrec_multiply(multrec(ithread, irow3D, icol3D)%p, &
left=left_buffer_p%matrix, &
right=right_buffer_p%matrix, &
flop=flop, &
a_norms=left_norms(:, ileft_buffer_calc), &
b_norms=right_norms(:, iright_buffer_calc), &
k_sizes=k_sizes)
!$OMP END PARALLEL
CALL timestop(handle2)
END IF
! Reduce 3D layers and finalize the local layer
IF (istep_k .GT. final_step_k) THEN
! Wait for the other 3D layers to reduce
IF (istep_k .GT. final_step_k + 1) THEN
CALL timeset(routineN//"_red3D_data", handle2)
CALL mp_waitall(requests_reduction)
CALL timestop(handle2)
DO myt = 0, nthreads - 1
DEALLOCATE (multrec(myt, irow3D_send, icol3D_send)%p)
CALL dbcsr_work_destroy( &
product_matrix3D%mats(irow3D_send, icol3D_send)%matrix%wms(myt + 1))
END DO
DEALLOCATE (product_matrix3D%mats(irow3D_send, icol3D_send)%matrix%wms)
CALL dbcsr_release(product_matrix3D%mats(irow3D_send, icol3D_send)%matrix)
END IF
irow3D_send = irow3D
icol3D_send = icol3D
! Store the initial shift for the recv node
IF (istep_k .EQ. final_step_k + 1) THEN
shift3D_recv = shift3D - 4
END IF
!$OMP PARALLEL DEFAULT (NONE) &
!$OMP SHARED (multrec, irow3D, icol3D, product_matrix3D, &
!$OMP memtype_mpi_buffer, nthreads, myt, istep_k, &
!$OMP irow3D_send, icol3D_send, myrow3D, mycol3D, &
!$OMP last_step_k, proc3D_send, proc3D_recv, &
!$OMP product_matrix_size_send, product_matrix_size_recv, &
!$OMP nrows3D, ncols3D, shift3D_recv, myrank3D, &
!$OMP layers_3D_C_reduction, requests_reduction_size, &
!$OMP final_step_k, handle2, buffers_win, g2l_map_rows, g2l_map_cols) &
!$OMP PRIVATE (ithread) &
!$OMP REDUCTION (+: flop)
ithread = 0
!$ ithread = omp_get_thread_num()
!
! Evaluate the size of layers to send and set the buffers
IF (irow3D .NE. myrow3D .OR. &
icol3D .NE. mycol3D) THEN
CALL dbcsr_mm_multrec_dev2host_init(multrec(ithread, irow3D, icol3D)%p)
!$OMP ATOMIC
product_matrix3D%mats(irow3D_send, icol3D_send)%matrix%nblks = &
product_matrix3D%mats(irow3D_send, icol3D_send)%matrix%nblks + &
dbcsr_mm_multrec_get_nblks(multrec(ithread, irow3D_send, icol3D_send)%p)
!$OMP BARRIER
!$OMP MASTER
! First (nthreads+1) positions are reserved for
! the offset sizes of each thread for meta
CALL ensure_array_size(buffers_win%left%meta_red3D, &
ub=product_matrix3D%mats(irow3D_send, icol3D_send)% &
matrix%nblks*3 + (nthreads + 1), &
nocopy=.TRUE.)
! Set the offsets
buffers_win%left%meta_red3D(1) = nthreads + 1
DO myt = 0, nthreads - 1
buffers_win%left%meta_red3D(myt + 2) = &
buffers_win%left%meta_red3D(myt + 1) + &
dbcsr_mm_multrec_get_nblks(multrec(myt, irow3D_send, icol3D_send)%p)*3
product_matrix_size_send(myt + 2) = &
dbcsr_mm_multrec_get_nze(multrec(myt, irow3D_send, icol3D_send)%p)
END DO
! Send/recv data and meta sizes
product_matrix_size_send(1) = &
buffers_win%left%meta_red3D(nthreads + 1)
proc3D_send = (icol3D_send - 1)*nrows3D + irow3D_send - 1
!
CALL row_col_3D_reflected(irow3D, icol3D, nrows3D, ncols3D, shift3D_recv)
shift3D_recv = shift3D_recv - 1
proc3D_recv = (icol3D - 1)*nrows3D + irow3D - 1
CALL mp_isendrecv(product_matrix_size_send, proc3D_send, &
product_matrix_size_recv, proc3D_recv, &
layers_3D_C_reduction%grp3D, &
requests_reduction_size(1), &
requests_reduction_size(2))
!$OMP END MASTER
ELSE
IF (istep_k .NE. last_step_k) &
DBCSR_ABORT("Last layer does not correspond to local layer")
END IF
! Reduce to the local layer
IF (istep_k .GT. final_step_k + 1) THEN
IF (dbcsr_data_get_size_referenced(layers_3D_C_reduction%data_red3D(ithread + 1)) .GT. 0) THEN
CALL timeset(routineN//"_red3D", handle2)
CALL dbcsr_mm_multrec_red3D(multrec(ithread, myrow3D, mycol3D)%p, &
buffers_win%right%meta_red3D, &
layers_3D_C_reduction%data_red3D(ithread + 1), &
flop, g2l_map_rows, g2l_map_cols)
CALL timestop(handle2)
END IF
END IF
!$OMP END PARALLEL
END IF
END IF calc
!
! Swap temporary buffers for the first tick
IF (do_square_layers3D .AND. first_k .AND. &
istep_k .LT. last_step_k) THEN
iright_buffer_comm = right_buffers(iright_buffer_comm)%coord3D
ileft_buffer_comm = left_buffers(ileft_buffer_comm)%coord3D
CALL swap_buffers(right_buffers(iright_buffer_comm), right_buffers(nbuffers))
CALL swap_buffers(left_buffers(ileft_buffer_comm), left_buffers(nbuffers))
END IF
!
iright_buffer_calc = iright_buffer_comm
ileft_buffer_calc = ileft_buffer_comm
END DO grouped_steps_index
!
CALL timestop(handle1)
!
CALL m_memory(mem)
max_memory = MAX(max_memory, REAL(mem))
!
IF (do_layers3D .AND. keep_sparsity) THEN
DEALLOCATE (product_matrix_meta_size, product_matrix_meta_displ)
DEALLOCATE (product_matrix_meta)
END IF
DEALLOCATE (right_norms, left_norms)
DEALLOCATE (product_matrix_epss_size, product_matrix_epss_displ)
IF (.NOT. otf_filtering .OR. (do_layers3D .AND. nrows3D .GT. 1)) THEN
DEALLOCATE (product_matrix_epss)
ELSE
DEALLOCATE (row_max_epss)
END IF
!
DEALLOCATE (left_images_size, right_images_size)
NULLIFY (left_images_size, right_images_size)
DEALLOCATE (left_images_size_layers3D, left_displ_layers3D)
DEALLOCATE (right_images_size_layers3D, right_displ_layers3D)
!
! Deallocate 3D layers
IF (do_layers3D) THEN
DEALLOCATE (product_matrix_size_send, product_matrix_size_recv)
DEALLOCATE (requests_reduction)
DO icol3D = 1, ncols3D
DO irow3D = 1, nrows3D
IF (irow3D .NE. myrow3D .OR. icol3D .NE. mycol3D) THEN
DEALLOCATE (product_matrix3D%mats(irow3D, icol3D)%matrix)
END IF
END DO
END DO
CALL dbcsr_data_clear_pointer(data_send)
CALL dbcsr_data_release(data_send)
END IF
DEALLOCATE (product_matrix3D%mats)
! Finalize local layer
!$OMP PARALLEL DEFAULT (NONE) &
!$OMP SHARED (multrec, myrow3D, mycol3D) &
!$OMP PRIVATE (ithread)
ithread = 0
!$ ithread = omp_get_thread_num()
CALL dbcsr_mm_multrec_finalize(multrec(ithread, myrow3D, mycol3D)%p)
DEALLOCATE (multrec(ithread, myrow3D, mycol3D)%p)
!$OMP END PARALLEL
DEALLOCATE (multrec)
DEALLOCATE (g2l_map_rows, g2l_map_cols)
CALL dbcsr_finalize(product_matrix, reshuffle=PRESENT(filter_eps) .AND. .NOT. keep_sparsity)
!
DO irow3D = 1, nrows3D
DEALLOCATE (m_sizes(irow3D)%sizes)
END DO
DEALLOCATE (m_sizes)
DO icol3D = 1, ncols3D
DEALLOCATE (n_sizes(icol3D)%sizes)
END DO
DEALLOCATE (n_sizes)
IF (ASSOCIATED(k_sizes)) DEALLOCATE (k_sizes)
!
CALL dbcsr_data_clear_pointer(data_get)
CALL dbcsr_data_release(data_get)
!
! clean-up of communication buffers
DO v_ki = 1, nbuffers
CALL dbcsr_data_clear_pointer(left_buffers(v_ki)%data)
IF (left_buffers(v_ki)%data%d%memory_type%acc_devalloc) THEN
CALL acc_event_destroy(left_buffers(v_ki)%data%d%acc_ready)
END IF
CALL dbcsr_data_release(left_buffers(v_ki)%data)
NULLIFY (left_buffers(v_ki)%matrix%index)
CALL dbcsr_release(left_buffers(v_ki)%matrix)
!
CALL dbcsr_data_clear_pointer(right_buffers(v_ki)%data)
IF (right_buffers(v_ki)%data%d%memory_type%acc_devalloc) THEN
CALL acc_event_destroy(right_buffers(v_ki)%data%d%acc_ready)
END IF
CALL dbcsr_data_release(right_buffers(v_ki)%data)
NULLIFY (right_buffers(v_ki)%matrix%index)
CALL dbcsr_release(right_buffers(v_ki)%matrix)
IF (has_acc) THEN
CALL dbcsr_data_clear_pointer(right_buffers(v_ki)%trs_stackbuf)
IF (right_buffers(v_ki)%trs_stackbuf%d%memory_type%acc_devalloc) THEN
CALL acc_event_destroy(right_buffers(v_ki)%trs_stackbuf%d%acc_ready)
END IF
CALL dbcsr_data_release(right_buffers(v_ki)%trs_stackbuf)
END IF
END DO
DEALLOCATE (left_buffers, right_buffers)
DEALLOCATE (do_comm_left, do_comm_right)
DEALLOCATE (right_vcol, left_vrow)
IF (has_acc) THEN
DEALLOCATE (row_blk_sizes2enum, enum2row_blk_sizes)
DEALLOCATE (col_blk_sizes2enum, enum2col_blk_sizes)
END IF
!
IF (debug_mod) THEN
v_ki = 0
DO blk = 1, SIZE(product_matrix%blk_p)
v_ki = MAX(v_ki, ABS(product_matrix%blk_p(blk)))
END DO
WRITE (*, *) routineN//" Actual final size", &
LOG(REAL(dbcsr_data_get_size(product_matrix%data_area)))/LOG(10.0), &
LOG(REAL(v_ki))/LOG(10.0)
END IF
!
CALL timestop(handle)
END SUBROUTINE multiply_3D