Makes column-based and row-based images of a matrix.
Type | Intent | Optional | Attributes | Name | ||
---|---|---|---|---|---|---|
type(dbcsr_type), | intent(in) | :: | ism |
input symmetric matrix |
||
type(dbcsr_2d_array_type), | intent(out) | :: | ums |
normalized matrices |
||
type(dbcsr_imagedistribution_obj), | intent(in) | :: | target_imgdist |
image distribution to normalize to |
||
logical, | intent(in), | optional | :: | desymmetrize |
desymmetrize a symmetric matrix |
|
character, | intent(in), | optional | :: | predistribute |
predistribute data for multiplication |
|
logical, | intent(in), | optional | :: | no_copy_data |
try to not merge data at the end |
|
type(dbcsr_scalar_type), | intent(in), | optional | :: | scale_value |
scale with this value |
SUBROUTINE make_images(ism, ums, target_imgdist, desymmetrize, predistribute, &
no_copy_data, scale_value)
!! Makes column-based and row-based images of a matrix.
TYPE(dbcsr_type), INTENT(IN) :: ism
!! input symmetric matrix
TYPE(dbcsr_2d_array_type), INTENT(OUT) :: ums
!! normalized matrices
TYPE(dbcsr_imagedistribution_obj), INTENT(IN) :: target_imgdist
!! image distribution to normalize to
LOGICAL, INTENT(IN), OPTIONAL :: desymmetrize
!! desymmetrize a symmetric matrix
CHARACTER, INTENT(IN), OPTIONAL :: predistribute
!! predistribute data for multiplication
LOGICAL, INTENT(IN), OPTIONAL :: no_copy_data
!! try to not merge data at the end
TYPE(dbcsr_scalar_type), INTENT(IN), OPTIONAL :: scale_value
!! scale with this value
CHARACTER(len=*), PARAMETER :: routineN = 'make_images'
INTEGER, PARAMETER :: metalen = 5
LOGICAL, PARAMETER :: dbg = .FALSE.
CHARACTER :: predist_type, predist_type_fwd
INTEGER :: blk, blk_l, blk_p, bp, col, col_img, col_size, coli, data_type, dst_p, handle, &
handle2, ithread, mp_group, ncol_images, nrow_images, nsymmetries, nthreads, numproc, &
nze, pcol, prow, row, row_img, row_size, rowi, sd_pos, sm_pos, src_p, stored_blk_p, &
stored_col, stored_row, symmetry_i, tr_col_size, tr_row_size, vcol, vrow
INTEGER, ALLOCATABLE, DIMENSION(:) :: myt_sdp, myt_smp, rd_disp, recv_meta, &
rm_disp, sd_disp, sdp, send_meta, &
sm_disp, smp
INTEGER, ALLOCATABLE, DIMENSION(:, :) :: all_total_send_offset, blk_ps, blks, &
myt_total_send_count, &
total_recv_count, total_send_count
INTEGER, ALLOCATABLE, DIMENSION(:, :, :, :) :: myt_send_count, recv_count, send_count
INTEGER, DIMENSION(:), CONTIGUOUS, POINTER :: col_blk_size, col_dist, col_img_dist, &
row_blk_size, row_dist, row_img_dist
INTEGER, DIMENSION(:, :), CONTIGUOUS, POINTER :: blacs2mpi
LOGICAL :: nocopy, tr
TYPE(dbcsr_data_obj) :: received_data_area, recv_data_area, &
send_data_area
TYPE(dbcsr_distribution_obj) :: old_dist, target_dist
TYPE(dbcsr_iterator) :: iter
TYPE(dbcsr_memtype_type) :: data_memory_type
TYPE(dbcsr_mp_obj) :: mp_obj
TYPE(dbcsr_scalar_type) :: scale_neg_one
TYPE(dbcsr_type) :: sm
! ---------------------------------------------------------------------------
! Check input matrix
! Set convenient variables to access input matrix info
!
CALL timeset(routineN, handle)
nocopy = .FALSE.
IF (PRESENT(no_copy_data)) nocopy = no_copy_data
sm = ism
nsymmetries = 1
IF (PRESENT(desymmetrize)) THEN
IF (desymmetrize .AND. sm%symmetry) THEN
nsymmetries = 2
END IF
END IF
SELECT CASE (predistribute)
CASE ('L', 'l')
predist_type = 'L'
predist_type_fwd = 'l'
CASE ('R', 'r')
predist_type = 'R'
predist_type_fwd = 'r'
CASE default
DBCSR_ABORT("Incorrect pre-shift specifier.")
END SELECT
data_type = sm%data_type
IF (data_type .NE. dbcsr_type_real_8 .AND. &
data_type .NE. dbcsr_type_real_4 .AND. &
data_type .NE. dbcsr_type_complex_8 .AND. &
data_type .NE. dbcsr_type_complex_4) &
DBCSR_ABORT("Invalid data type.")
scale_neg_one = dbcsr_scalar_negative(dbcsr_scalar_one(data_type))
row_blk_size => array_data(sm%row_blk_size)
col_blk_size => array_data(sm%col_blk_size)
old_dist = dbcsr_distribution(ism)
target_dist = target_imgdist%i%main
row_dist => dbcsr_distribution_row_dist(target_dist)
col_dist => dbcsr_distribution_col_dist(target_dist)
IF (sm%symmetry) THEN
IF (SIZE(row_dist) .NE. SIZE(col_dist)) &
DBCSR_WARN('Unequal row and column distributions for symmetric matrix.')
END IF
nrow_images = target_imgdist%i%row_decimation
IF (nrow_images .GT. 1) THEN
row_img_dist => array_data(target_imgdist%i%row_image)
ELSE
NULLIFY (row_img_dist)
END IF
ncol_images = target_imgdist%i%col_decimation
IF (ncol_images .GT. 1) THEN
col_img_dist => array_data(target_imgdist%i%col_image)
ELSE
NULLIFY (col_img_dist)
END IF
mp_obj = dbcsr_distribution_mp(target_dist)
blacs2mpi => dbcsr_mp_pgrid(mp_obj)
numproc = dbcsr_mp_numnodes(mp_obj)
mp_group = dbcsr_mp_group(mp_obj)
IF (dbcsr_distribution_max_row_dist(old_dist) .GT. UBOUND(blacs2mpi, 1)) &
DBCSR_ABORT('Row distribution references unexistent processor rows')
IF (dbg) THEN
IF (dbcsr_distribution_max_row_dist(old_dist) .NE. UBOUND(blacs2mpi, 1)) &
DBCSR_WARN('Range of row distribution not equal to processor rows')
END IF
IF (dbcsr_distribution_max_col_dist(old_dist) .GT. UBOUND(blacs2mpi, 2)) &
DBCSR_ABORT('Col distribution references unexistent processor cols')
IF (dbg) THEN
IF (dbcsr_distribution_max_col_dist(old_dist) .NE. UBOUND(blacs2mpi, 2)) &
DBCSR_WARN('Range of col distribution not equal to processor cols')
END IF
! Check threads configuration
!$ IF (.NOT. dbcsr_distribution_has_threads(old_dist)) &
!$ DBCSR_ABORT("Thread distribution not defined")
! Allocate shared temporary buffers
!
ALLOCATE (send_count(2, nrow_images, ncol_images, 0:numproc - 1)); send_count = 0
ALLOCATE (recv_count(2, nrow_images, ncol_images, 0:numproc - 1))
ALLOCATE (total_send_count(2, 0:numproc - 1)); total_send_count = 0
ALLOCATE (total_recv_count(2, 0:numproc - 1))
ALLOCATE (sdp(0:numproc - 1))
ALLOCATE (smp(0:numproc - 1))
ALLOCATE (sd_disp(0:numproc - 1)); sd_disp(0) = 1
ALLOCATE (sm_disp(0:numproc - 1)); sm_disp(0) = 1
ALLOCATE (rd_disp(0:numproc - 1)); rd_disp(0) = 1
ALLOCATE (rm_disp(0:numproc - 1)); rm_disp(0) = 1
ALLOCATE (all_total_send_offset(2, 0:numproc - 1))
ALLOCATE (blk_ps(nrow_images, ncol_images)); blk_ps = 1
ALLOCATE (blks(nrow_images, ncol_images)); blks = 1
!
! Allocate and init mats
!
ALLOCATE (ums%mats(nrow_images, ncol_images))
data_memory_type = memtype_abpanel_1
DO row_img = 1, nrow_images
DO col_img = 1, ncol_images
ums%mats(row_img, col_img) = dbcsr_type()
CALL dbcsr_create(ums%mats(row_img, col_img), "imaged "//sm%name, &
target_dist, &
dbcsr_type_no_symmetry, &
row_blk_size_obj=sm%row_blk_size, col_blk_size_obj=sm%col_blk_size, &
nze=0, data_type=data_type, &
max_rbs=sm%max_rbs, max_cbs=sm%max_cbs, &
row_blk_offset=sm%row_blk_offset, col_blk_offset=sm%col_blk_offset, &
thread_dist=sm%dist, &
data_memory_type=data_memory_type, &
index_memory_type=memtype_mpi_buffer)
ums%mats(row_img, col_img)%negate_real = sm%negate_real
ums%mats(row_img, col_img)%negate_imaginary = sm%negate_imaginary
END DO
END DO
nthreads = 1
!$OMP PARALLEL DEFAULT (NONE) &
!$OMP PRIVATE (ithread, symmetry_i, row_img, col_img, &
!$OMP myt_send_count, myt_total_send_count, &
!$OMP iter, row, col, blk, row_size, col_size, stored_row, stored_col, &
!$OMP prow, pcol, rowi, coli, vrow, vcol, dst_p, nze, myt_smp, myt_sdp, &
!$OMP blk_p, bp, sd_pos, sm_pos,tr, &
!$OMP tr_row_size, tr_col_size) &
!$OMP SHARED (nthreads, nsymmetries, row_img_dist, col_img_dist, &
!$OMP nrow_images, ncol_images, numproc, scale_value, &
!$OMP ums, sm, ism, target_imgdist, row_dist, col_dist,&
!$OMP predist_type_fwd, blacs2mpi, row_blk_size, col_blk_size, &
!$OMP send_count, recv_count, handle2,mp_group, &
!$OMP total_send_count, total_recv_count, recv_data_area, nocopy, &
!$OMP data_type, recv_meta, send_data_area, send_meta, &
!$OMP sd_disp, sm_disp, rd_disp, rm_disp, all_total_send_offset, blk_ps, blks, &
!$OMP received_data_area, scale_neg_one, memtype_abpanel_1)
ithread = 0
!$ ithread = omp_get_thread_num()
!$OMP MASTER
!$ nthreads = omp_get_num_threads()
!$OMP END MASTER
! Allocate thread private data
!
ALLOCATE (myt_send_count(2, nrow_images, ncol_images, 0:numproc - 1)); myt_send_count(:, :, :, :) = 0
ALLOCATE (myt_total_send_count(2, 0:numproc - 1))
! Thread-local pointers of the current adding position into the send buffers
ALLOCATE (myt_smp(0:numproc - 1), myt_sdp(0:numproc - 1))
! Count sizes for sending
!
CALL dbcsr_iterator_start(iter, ism, shared=.TRUE.)
DO WHILE (dbcsr_iterator_blocks_left(iter))
CALL dbcsr_iterator_next_block(iter, row, col, blk, &
row_size=row_size, col_size=col_size)
nze = row_size*col_size
IF (nze .EQ. 0) CYCLE
DO symmetry_i = 1, nsymmetries
IF (symmetry_i .EQ. 1) THEN
stored_row = row; stored_col = col
ELSE
IF (row .EQ. col) CYCLE
stored_row = col; stored_col = row
END IF
! Where do we send this block?
row_img = 1
IF (nrow_images .GT. 1) row_img = row_img_dist(stored_row)
col_img = 1
IF (ncol_images .GT. 1) col_img = col_img_dist(stored_col)
CALL image_calculator(target_imgdist, &
prow=prow, rowi=rowi, &
pcol=pcol, coli=coli, &
vprow=vrow, vpcol=vcol, &
myprow=row_dist(stored_row), myrowi=row_img, &
mypcol=col_dist(stored_col), mycoli=col_img, &
shifting=predist_type_fwd)
dst_p = blacs2mpi(prow, pcol)
! These counts are meant for the thread that processes this row.
myt_send_count(1, rowi, coli, dst_p) = &
myt_send_count(1, rowi, coli, dst_p) + 1
myt_send_count(2, rowi, coli, dst_p) = &
myt_send_count(2, rowi, coli, dst_p) + nze
END DO ! symmetry_i
END DO
CALL dbcsr_iterator_stop(iter)
DO dst_p = 0, numproc - 1
myt_total_send_count(1, dst_p) = SUM(myt_send_count(1, :, :, dst_p))
myt_total_send_count(2, dst_p) = SUM(myt_send_count(2, :, :, dst_p))
END DO
! Merge the send counts
!$OMP CRITICAL
send_count(:, :, :, :) = send_count(:, :, :, :) + myt_send_count(:, :, :, :)
total_send_count(:, :) = total_send_count(:, :) + myt_total_send_count(:, :)
!$OMP END CRITICAL
!$OMP BARRIER
!$OMP MASTER
CALL timeset(routineN//"_sizes", handle2)
CALL mp_alltoall(send_count, recv_count, 2*nrow_images*ncol_images, &
mp_group)
CALL timestop(handle2)
!$OMP END MASTER
!$OMP BARRIER
! Fill in the meta data structures and copy the data.
!$OMP DO
DO dst_p = 0, numproc - 1
total_recv_count(1, dst_p) = SUM(recv_count(1, :, :, dst_p))
total_recv_count(2, dst_p) = SUM(recv_count(2, :, :, dst_p))
END DO
!$OMP MASTER
! Allocate data structures needed for data exchange.
CALL dbcsr_data_init(recv_data_area)
CALL dbcsr_data_init(send_data_area)
IF (nrow_images .EQ. 1 .AND. ncol_images .EQ. 1 .OR. nocopy) THEN
! For some cases the faster dbcsr_special_finalize(reshuffle=.FALSE.) can be used.
! This basically makes this working matrix the actual data-area.
! Hence, for those cases we have to use data_memory_type already here.
CALL dbcsr_data_new(recv_data_area, data_type, SUM(total_recv_count(2, :)), memory_type=memtype_abpanel_1)
! Some MPI implementations have high overhead when encountering a new buffer.
! Therefore we also use the memory pool for the send buffer.
CALL dbcsr_data_new(send_data_area, data_type, SUM(total_send_count(2, :)), memory_type=memtype_abpanel_1)
ELSE
CALL dbcsr_data_new(recv_data_area, data_type, SUM(total_recv_count(2, :)))
CALL dbcsr_data_new(send_data_area, data_type, SUM(total_send_count(2, :)))
END IF
ALLOCATE (recv_meta(metalen*SUM(total_recv_count(1, :))))
ALLOCATE (send_meta(metalen*SUM(total_send_count(1, :))))
! Calculate displacements for processors needed for the exchanges.
DO dst_p = 1, numproc - 1
sm_disp(dst_p) = sm_disp(dst_p - 1) &
+ metalen*total_send_count(1, dst_p - 1)
sd_disp(dst_p) = sd_disp(dst_p - 1) &
+ total_send_count(2, dst_p - 1)
rm_disp(dst_p) = rm_disp(dst_p - 1) &
+ metalen*total_recv_count(1, dst_p - 1)
rd_disp(dst_p) = rd_disp(dst_p - 1) &
+ total_recv_count(2, dst_p - 1)
END DO
myt_smp(:) = sm_disp(:)
myt_sdp(:) = sd_disp(:)
IF (nthreads .GT. 1) THEN
all_total_send_offset(1, :) = myt_smp(:) + metalen*myt_total_send_count(1, :)
all_total_send_offset(2, :) = myt_sdp(:) + myt_total_send_count(2, :)
END IF
!$OMP END MASTER
!$OMP BARRIER
IF (ithread .GT. 0) THEN
!$OMP CRITICAL
myt_smp(:) = all_total_send_offset(1, :)
myt_sdp(:) = all_total_send_offset(2, :)
all_total_send_offset(1, :) &
= all_total_send_offset(1, :) + metalen*myt_total_send_count(1, :)
all_total_send_offset(2, :) &
= all_total_send_offset(2, :) + myt_total_send_count(2, :)
!$OMP END CRITICAL
ELSE
CALL dbcsr_data_init(received_data_area)
received_data_area = recv_data_area
CALL dbcsr_data_hold(received_data_area)
DO row_img = 1, nrow_images
DO col_img = 1, ncol_images
CALL dbcsr_work_create(ums%mats(row_img, col_img), &
SUM(recv_count(1, row_img, col_img, :)), n=1)
CALL dbcsr_data_hold(received_data_area)
CALL dbcsr_data_release(ums%mats(row_img, col_img)%wms(1)%data_area)
ums%mats(row_img, col_img)%wms(1)%data_area = received_data_area
END DO
END DO
END IF
!$OMP BARRIER
! Add timing call to the packing of the send buffers
!
CALL timeset(routineN//"_pack", handle2)
! Copies metadata and actual data to be sent into the send buffers.
CALL dbcsr_iterator_start(iter, ism, shared=.TRUE.)
SELECT CASE (data_type)
CASE (dbcsr_type_real_4)
CALL prepare_buffers_s(sm%negate_real, sm%negate_imaginary, &
iter, row, col, blk, blk_p, bp, &
row_size, col_size, nze, nsymmetries, symmetry_i, &
stored_row, stored_col, tr_row_size, tr_col_size, tr, &
row_img, col_img, nrow_images, ncol_images, &
row_img_dist, col_img_dist, predist_type_fwd, blacs2mpi, &
target_imgdist, prow, pcol, rowi, coli, &
row_dist, col_dist, dst_p, sm_pos, myt_smp, metalen, &
sd_pos, myt_sdp, send_meta, sd_disp, &
dbcsr_get_data_p_s(sm%data_area), &
send_data_area, scale_neg_one, scale_value)
CASE (dbcsr_type_real_8)
CALL prepare_buffers_d(sm%negate_real, sm%negate_imaginary, &
iter, row, col, blk, blk_p, bp, &
row_size, col_size, nze, nsymmetries, symmetry_i, &
stored_row, stored_col, tr_row_size, tr_col_size, tr, &
row_img, col_img, nrow_images, ncol_images, &
row_img_dist, col_img_dist, predist_type_fwd, blacs2mpi, &
target_imgdist, prow, pcol, rowi, coli, &
row_dist, col_dist, dst_p, sm_pos, myt_smp, metalen, &
sd_pos, myt_sdp, send_meta, sd_disp, &
dbcsr_get_data_p_d(sm%data_area), &
send_data_area, scale_neg_one, scale_value)
CASE (dbcsr_type_complex_4)
CALL prepare_buffers_c(sm%negate_real, sm%negate_imaginary, &
iter, row, col, blk, blk_p, bp, &
row_size, col_size, nze, nsymmetries, symmetry_i, &
stored_row, stored_col, tr_row_size, tr_col_size, tr, &
row_img, col_img, nrow_images, ncol_images, &
row_img_dist, col_img_dist, predist_type_fwd, blacs2mpi, &
target_imgdist, prow, pcol, rowi, coli, &
row_dist, col_dist, dst_p, sm_pos, myt_smp, metalen, &
sd_pos, myt_sdp, send_meta, sd_disp, &
dbcsr_get_data_p_c(sm%data_area), &
send_data_area, scale_neg_one, scale_value)
CASE (dbcsr_type_complex_8)
CALL prepare_buffers_z(sm%negate_real, sm%negate_imaginary, &
iter, row, col, blk, blk_p, bp, &
row_size, col_size, nze, nsymmetries, symmetry_i, &
stored_row, stored_col, tr_row_size, tr_col_size, tr, &
row_img, col_img, nrow_images, ncol_images, &
row_img_dist, col_img_dist, predist_type_fwd, blacs2mpi, &
target_imgdist, prow, pcol, rowi, coli, &
row_dist, col_dist, dst_p, sm_pos, myt_smp, metalen, &
sd_pos, myt_sdp, send_meta, sd_disp, &
dbcsr_get_data_p_z(sm%data_area), &
send_data_area, scale_neg_one, scale_value)
END SELECT
CALL dbcsr_iterator_stop(iter)
! Deallocate thread private data
!
DEALLOCATE (myt_send_count)
DEALLOCATE (myt_total_send_count)
DEALLOCATE (myt_smp, myt_sdp)
CALL timestop(handle2)
!$OMP END PARALLEL
! Exchange the data and metadata structures. In the interesting cases (square grids, row col distribution same),
! there are only very few processors that need to exchange data.
! The hybrid_alltoall deals with this by doing point to point communication
CALL timeset(routineN//"_data", handle2)
CALL hybrid_alltoall_any(send_data_area, total_send_count(2, :), sd_disp(:) - 1, &
recv_data_area, total_recv_count(2, :), rd_disp(:) - 1, &
mp_obj, &
most_ptp=.TRUE., remainder_ptp=.TRUE., no_hybrid=.FALSE.)
CALL hybrid_alltoall_i1( &
send_meta(:), metalen*total_send_count(1, :), sm_disp(:) - 1, &
recv_meta(:), metalen*total_recv_count(1, :), rm_disp(:) - 1, &
most_ptp=.TRUE., remainder_ptp=.TRUE., no_hybrid=.FALSE., &
mp_env=mp_obj)
CALL timestop(handle2)
! Now create the work index and/or copy the relevant data from the
! receive buffer into the local indices.
!
DO src_p = 0, numproc - 1
DO blk_l = 1, total_recv_count(1, src_p)
stored_row = recv_meta(rm_disp(src_p) + metalen*(blk_l - 1))
stored_col = recv_meta(rm_disp(src_p) + metalen*(blk_l - 1) + 1)
stored_blk_p = recv_meta(rm_disp(src_p) + metalen*(blk_l - 1) + 2)
row_img = recv_meta(rm_disp(src_p) + metalen*(blk_l - 1) + 3)
col_img = recv_meta(rm_disp(src_p) + metalen*(blk_l - 1) + 4)
nze = row_blk_size(stored_row)*col_blk_size(stored_col)
blk = blks(row_img, col_img)
blks(row_img, col_img) = blks(row_img, col_img) + 1
blk_ps(row_img, col_img) = blk_ps(row_img, col_img) + nze
ums%mats(row_img, col_img)%wms(1)%row_i(blk) = stored_row
ums%mats(row_img, col_img)%wms(1)%col_i(blk) = stored_col
ums%mats(row_img, col_img)%wms(1)%blk_p(blk) = &
SIGN(rd_disp(src_p) + ABS(stored_blk_p) - 1, stored_blk_p)
END DO
END DO
! Finalize the actual imaged matrices from the work matrices
!
DO row_img = 1, nrow_images
DO col_img = 1, ncol_images
ums%mats(row_img, col_img)%wms(1)%lastblk = blks(row_img, col_img) - 1
ums%mats(row_img, col_img)%wms(1)%datasize = blk_ps(row_img, col_img) - 1
!
CALL dbcsr_data_set_size_referenced( &
ums%mats(row_img, col_img)%wms(1)%data_area, &
ums%mats(row_img, col_img)%wms(1)%datasize)
IF (nrow_images .EQ. 1 .AND. ncol_images .EQ. 1 .OR. nocopy) THEN
CALL dbcsr_special_finalize(ums%mats(row_img, col_img), reshuffle=.FALSE.)
ELSE
CALL dbcsr_special_finalize(ums%mats(row_img, col_img), reshuffle=.TRUE.)
END IF
! Save the home process and image row and column
CALL image_calculator(target_imgdist, &
ums%mats(row_img, col_img)%index(dbcsr_slot_home_prow), &
ums%mats(row_img, col_img)%index(dbcsr_slot_home_rowi), &
ums%mats(row_img, col_img)%index(dbcsr_slot_home_pcol), &
ums%mats(row_img, col_img)%index(dbcsr_slot_home_coli), &
vprow=ums%mats(row_img, col_img)%index(dbcsr_slot_home_vprow), &
vpcol=ums%mats(row_img, col_img)%index(dbcsr_slot_home_vpcol), &
myrowi=row_img, mycoli=col_img, &
shifting=predist_type)
END DO
END DO
! Deallocate shared temporary buffers
!
DEALLOCATE (send_count, recv_count)
DEALLOCATE (total_send_count, total_recv_count)
DEALLOCATE (sdp, smp, sd_disp, sm_disp)
DEALLOCATE (rd_disp, rm_disp)
DEALLOCATE (all_total_send_offset)
DEALLOCATE (blk_ps, blks)
DEALLOCATE (recv_meta, send_meta)
CALL dbcsr_data_release(send_data_area)
CALL dbcsr_data_release(received_data_area)
CALL dbcsr_data_release(recv_data_area)
CALL timestop(handle)
END SUBROUTINE make_images