Makes column-based and row-based images of a matrix.
Type | Intent | Optional | Attributes | Name | ||
---|---|---|---|---|---|---|
type(dbcsr_type), | intent(in) | :: | ism |
input symmetric matrix |
||
type(dbcsr_2d_array_type), | intent(out) | :: | ums |
normalized matrices |
||
type(dbcsr_imagedistribution_obj), | intent(in) | :: | target_imgdist |
image distribution to normalize to |
||
logical, | intent(in), | optional | :: | desymmetrize |
desymmetrize a symmetric matrix |
|
character(len=1), | intent(in), | optional | :: | predistribute |
predistribute data for multiplication |
|
logical, | intent(in), | optional | :: | no_copy_data |
try to not merge data at the end |
|
type(dbcsr_scalar_type), | intent(in), | optional | :: | scale_value |
scale with this value |
SUBROUTINE make_images(ism, ums, target_imgdist, desymmetrize, predistribute, & no_copy_data, scale_value) !! Makes column-based and row-based images of a matrix. TYPE(dbcsr_type), INTENT(IN) :: ism !! input symmetric matrix TYPE(dbcsr_2d_array_type), INTENT(OUT) :: ums !! normalized matrices TYPE(dbcsr_imagedistribution_obj), INTENT(IN) :: target_imgdist !! image distribution to normalize to LOGICAL, INTENT(IN), OPTIONAL :: desymmetrize !! desymmetrize a symmetric matrix CHARACTER, INTENT(IN), OPTIONAL :: predistribute !! predistribute data for multiplication LOGICAL, INTENT(IN), OPTIONAL :: no_copy_data !! try to not merge data at the end TYPE(dbcsr_scalar_type), INTENT(IN), OPTIONAL :: scale_value !! scale with this value CHARACTER(len=*), PARAMETER :: routineN = 'make_images' INTEGER, PARAMETER :: metalen = 5 LOGICAL, PARAMETER :: dbg = .FALSE. CHARACTER :: predist_type, predist_type_fwd INTEGER :: blk, blk_l, blk_p, bp, col, col_img, col_size, coli, data_type, dst_p, handle, & handle2, ithread, ncol_images, nrow_images, nsymmetries, nthreads, numproc, & nze, pcol, prow, row, row_img, row_size, rowi, sd_pos, sm_pos, src_p, stored_blk_p, & stored_col, stored_row, symmetry_i, tr_col_size, tr_row_size, vcol, vrow INTEGER, ALLOCATABLE, DIMENSION(:) :: myt_sdp, myt_smp, rd_disp, recv_meta, & rm_disp, sd_disp, sdp, send_meta, & sm_disp, smp INTEGER, ALLOCATABLE, DIMENSION(:, :) :: all_total_send_offset, blk_ps, blks, & myt_total_send_count, & total_recv_count, total_send_count INTEGER, ALLOCATABLE, DIMENSION(:, :, :, :) :: myt_send_count, recv_count, send_count INTEGER, DIMENSION(:), CONTIGUOUS, POINTER :: col_blk_size, col_dist, col_img_dist, & row_blk_size, row_dist, row_img_dist INTEGER, DIMENSION(:, :), CONTIGUOUS, POINTER :: blacs2mpi LOGICAL :: nocopy, tr TYPE(dbcsr_data_obj) :: received_data_area, recv_data_area, & send_data_area TYPE(dbcsr_distribution_obj) :: old_dist, target_dist TYPE(dbcsr_iterator) :: iter TYPE(dbcsr_memtype_type) :: data_memory_type TYPE(dbcsr_mp_obj) :: mp_obj TYPE(dbcsr_scalar_type) :: scale_neg_one TYPE(dbcsr_type) :: sm TYPE(mp_comm_type) :: mp_group ! --------------------------------------------------------------------------- ! Check input matrix ! Set convenient variables to access input matrix info ! CALL timeset(routineN, handle) nocopy = .FALSE. IF (PRESENT(no_copy_data)) nocopy = no_copy_data sm = ism nsymmetries = 1 IF (PRESENT(desymmetrize)) THEN IF (desymmetrize .AND. sm%symmetry) THEN nsymmetries = 2 END IF END IF SELECT CASE (predistribute) CASE ('L', 'l') predist_type = 'L' predist_type_fwd = 'l' CASE ('R', 'r') predist_type = 'R' predist_type_fwd = 'r' CASE default DBCSR_ABORT("Incorrect pre-shift specifier.") END SELECT data_type = sm%data_type IF (data_type .NE. dbcsr_type_real_8 .AND. & data_type .NE. dbcsr_type_real_4 .AND. & data_type .NE. dbcsr_type_complex_8 .AND. & data_type .NE. dbcsr_type_complex_4) & DBCSR_ABORT("Invalid data type.") scale_neg_one = dbcsr_scalar_negative(dbcsr_scalar_one(data_type)) row_blk_size => array_data(sm%row_blk_size) col_blk_size => array_data(sm%col_blk_size) old_dist = dbcsr_distribution(ism) target_dist = target_imgdist%i%main row_dist => dbcsr_distribution_row_dist(target_dist) col_dist => dbcsr_distribution_col_dist(target_dist) IF (sm%symmetry) THEN IF (SIZE(row_dist) .NE. SIZE(col_dist)) & DBCSR_WARN('Unequal row and column distributions for symmetric matrix.') END IF nrow_images = target_imgdist%i%row_decimation IF (nrow_images .GT. 1) THEN row_img_dist => array_data(target_imgdist%i%row_image) ELSE NULLIFY (row_img_dist) END IF ncol_images = target_imgdist%i%col_decimation IF (ncol_images .GT. 1) THEN col_img_dist => array_data(target_imgdist%i%col_image) ELSE NULLIFY (col_img_dist) END IF mp_obj = dbcsr_distribution_mp(target_dist) blacs2mpi => dbcsr_mp_pgrid(mp_obj) numproc = dbcsr_mp_numnodes(mp_obj) mp_group = dbcsr_mp_group(mp_obj) IF (dbcsr_distribution_max_row_dist(old_dist) .GT. UBOUND(blacs2mpi, 1)) & DBCSR_ABORT('Row distribution references unexistent processor rows') IF (dbg) THEN IF (dbcsr_distribution_max_row_dist(old_dist) .NE. UBOUND(blacs2mpi, 1)) & DBCSR_WARN('Range of row distribution not equal to processor rows') END IF IF (dbcsr_distribution_max_col_dist(old_dist) .GT. UBOUND(blacs2mpi, 2)) & DBCSR_ABORT('Col distribution references unexistent processor cols') IF (dbg) THEN IF (dbcsr_distribution_max_col_dist(old_dist) .NE. UBOUND(blacs2mpi, 2)) & DBCSR_WARN('Range of col distribution not equal to processor cols') END IF ! Check threads configuration !$ IF (.NOT. dbcsr_distribution_has_threads(old_dist)) & !$ DBCSR_ABORT("Thread distribution not defined") ! Allocate shared temporary buffers ! ALLOCATE (send_count(2, nrow_images, ncol_images, 0:numproc - 1)); send_count = 0 ALLOCATE (recv_count(2, nrow_images, ncol_images, 0:numproc - 1)) ALLOCATE (total_send_count(2, 0:numproc - 1)); total_send_count = 0 ALLOCATE (total_recv_count(2, 0:numproc - 1)) ALLOCATE (sdp(0:numproc - 1)) ALLOCATE (smp(0:numproc - 1)) ALLOCATE (sd_disp(0:numproc - 1)); sd_disp(0) = 1 ALLOCATE (sm_disp(0:numproc - 1)); sm_disp(0) = 1 ALLOCATE (rd_disp(0:numproc - 1)); rd_disp(0) = 1 ALLOCATE (rm_disp(0:numproc - 1)); rm_disp(0) = 1 ALLOCATE (all_total_send_offset(2, 0:numproc - 1)) ALLOCATE (blk_ps(nrow_images, ncol_images)); blk_ps = 1 ALLOCATE (blks(nrow_images, ncol_images)); blks = 1 ! ! Allocate and init mats ! ALLOCATE (ums%mats(nrow_images, ncol_images)) data_memory_type = memtype_abpanel_1 DO row_img = 1, nrow_images DO col_img = 1, ncol_images ums%mats(row_img, col_img) = dbcsr_type() CALL dbcsr_create(ums%mats(row_img, col_img), "imaged "//sm%name, & target_dist, & dbcsr_type_no_symmetry, & row_blk_size_obj=sm%row_blk_size, col_blk_size_obj=sm%col_blk_size, & nze=0, data_type=data_type, & max_rbs=sm%max_rbs, max_cbs=sm%max_cbs, & row_blk_offset=sm%row_blk_offset, col_blk_offset=sm%col_blk_offset, & thread_dist=sm%dist, & data_memory_type=data_memory_type, & index_memory_type=memtype_mpi_buffer) ums%mats(row_img, col_img)%negate_real = sm%negate_real ums%mats(row_img, col_img)%negate_imaginary = sm%negate_imaginary END DO END DO nthreads = 1 !$OMP PARALLEL DEFAULT (NONE) & !$OMP PRIVATE (ithread, symmetry_i, row_img, col_img, & !$OMP myt_send_count, myt_total_send_count, & !$OMP iter, row, col, blk, row_size, col_size, stored_row, stored_col, & !$OMP prow, pcol, rowi, coli, vrow, vcol, dst_p, nze, myt_smp, myt_sdp, & !$OMP blk_p, bp, sd_pos, sm_pos,tr, & !$OMP tr_row_size, tr_col_size) & !$OMP SHARED (nthreads, nsymmetries, row_img_dist, col_img_dist, & !$OMP nrow_images, ncol_images, numproc, scale_value, & !$OMP ums, sm, ism, target_imgdist, row_dist, col_dist,& !$OMP predist_type_fwd, blacs2mpi, row_blk_size, col_blk_size, & !$OMP send_count, recv_count, handle2,mp_group, & !$OMP total_send_count, total_recv_count, recv_data_area, nocopy, & !$OMP data_type, recv_meta, send_data_area, send_meta, & !$OMP sd_disp, sm_disp, rd_disp, rm_disp, all_total_send_offset, blk_ps, blks, & !$OMP received_data_area, scale_neg_one, memtype_abpanel_1) ithread = 0 !$ ithread = omp_get_thread_num() !$OMP MASTER !$ nthreads = omp_get_num_threads() !$OMP END MASTER ! Allocate thread private data ! ALLOCATE (myt_send_count(2, nrow_images, ncol_images, 0:numproc - 1)); myt_send_count(:, :, :, :) = 0 ALLOCATE (myt_total_send_count(2, 0:numproc - 1)) ! Thread-local pointers of the current adding position into the send buffers ALLOCATE (myt_smp(0:numproc - 1), myt_sdp(0:numproc - 1)) ! Count sizes for sending ! CALL dbcsr_iterator_start(iter, ism, shared=.TRUE.) DO WHILE (dbcsr_iterator_blocks_left(iter)) CALL dbcsr_iterator_next_block(iter, row, col, blk, & row_size=row_size, col_size=col_size) nze = row_size*col_size IF (nze .EQ. 0) CYCLE DO symmetry_i = 1, nsymmetries IF (symmetry_i .EQ. 1) THEN stored_row = row; stored_col = col ELSE IF (row .EQ. col) CYCLE stored_row = col; stored_col = row END IF ! Where do we send this block? row_img = 1 IF (nrow_images .GT. 1) row_img = row_img_dist(stored_row) col_img = 1 IF (ncol_images .GT. 1) col_img = col_img_dist(stored_col) CALL image_calculator(target_imgdist, & prow=prow, rowi=rowi, & pcol=pcol, coli=coli, & vprow=vrow, vpcol=vcol, & myprow=row_dist(stored_row), myrowi=row_img, & mypcol=col_dist(stored_col), mycoli=col_img, & shifting=predist_type_fwd) dst_p = blacs2mpi(prow, pcol) ! These counts are meant for the thread that processes this row. myt_send_count(1, rowi, coli, dst_p) = & myt_send_count(1, rowi, coli, dst_p) + 1 myt_send_count(2, rowi, coli, dst_p) = & myt_send_count(2, rowi, coli, dst_p) + nze END DO ! symmetry_i END DO CALL dbcsr_iterator_stop(iter) DO dst_p = 0, numproc - 1 myt_total_send_count(1, dst_p) = SUM(myt_send_count(1, :, :, dst_p)) myt_total_send_count(2, dst_p) = SUM(myt_send_count(2, :, :, dst_p)) END DO ! Merge the send counts !$OMP CRITICAL send_count(:, :, :, :) = send_count(:, :, :, :) + myt_send_count(:, :, :, :) total_send_count(:, :) = total_send_count(:, :) + myt_total_send_count(:, :) !$OMP END CRITICAL !$OMP BARRIER !$OMP MASTER CALL timeset(routineN//"_sizes", handle2) CALL mp_alltoall(send_count, recv_count, 2*nrow_images*ncol_images, & mp_group) CALL timestop(handle2) !$OMP END MASTER !$OMP BARRIER ! Fill in the meta data structures and copy the data. !$OMP DO DO dst_p = 0, numproc - 1 total_recv_count(1, dst_p) = SUM(recv_count(1, :, :, dst_p)) total_recv_count(2, dst_p) = SUM(recv_count(2, :, :, dst_p)) END DO !$OMP MASTER ! Allocate data structures needed for data exchange. CALL dbcsr_data_init(recv_data_area) CALL dbcsr_data_init(send_data_area) IF (nrow_images .EQ. 1 .AND. ncol_images .EQ. 1 .OR. nocopy) THEN ! For some cases the faster dbcsr_special_finalize(reshuffle=.FALSE.) can be used. ! This basically makes this working matrix the actual data-area. ! Hence, for those cases we have to use data_memory_type already here. CALL dbcsr_data_new(recv_data_area, data_type, SUM(total_recv_count(2, :)), memory_type=memtype_abpanel_1) ! Some MPI implementations have high overhead when encountering a new buffer. ! Therefore we also use the memory pool for the send buffer. CALL dbcsr_data_new(send_data_area, data_type, SUM(total_send_count(2, :)), memory_type=memtype_abpanel_1) ELSE CALL dbcsr_data_new(recv_data_area, data_type, SUM(total_recv_count(2, :))) CALL dbcsr_data_new(send_data_area, data_type, SUM(total_send_count(2, :))) END IF ALLOCATE (recv_meta(metalen*SUM(total_recv_count(1, :)))) ALLOCATE (send_meta(metalen*SUM(total_send_count(1, :)))) ! Calculate displacements for processors needed for the exchanges. DO dst_p = 1, numproc - 1 sm_disp(dst_p) = sm_disp(dst_p - 1) & + metalen*total_send_count(1, dst_p - 1) sd_disp(dst_p) = sd_disp(dst_p - 1) & + total_send_count(2, dst_p - 1) rm_disp(dst_p) = rm_disp(dst_p - 1) & + metalen*total_recv_count(1, dst_p - 1) rd_disp(dst_p) = rd_disp(dst_p - 1) & + total_recv_count(2, dst_p - 1) END DO myt_smp(:) = sm_disp(:) myt_sdp(:) = sd_disp(:) IF (nthreads .GT. 1) THEN all_total_send_offset(1, :) = myt_smp(:) + metalen*myt_total_send_count(1, :) all_total_send_offset(2, :) = myt_sdp(:) + myt_total_send_count(2, :) END IF !$OMP END MASTER !$OMP BARRIER IF (ithread .GT. 0) THEN !$OMP CRITICAL myt_smp(:) = all_total_send_offset(1, :) myt_sdp(:) = all_total_send_offset(2, :) all_total_send_offset(1, :) & = all_total_send_offset(1, :) + metalen*myt_total_send_count(1, :) all_total_send_offset(2, :) & = all_total_send_offset(2, :) + myt_total_send_count(2, :) !$OMP END CRITICAL ELSE CALL dbcsr_data_init(received_data_area) received_data_area = recv_data_area CALL dbcsr_data_hold(received_data_area) DO row_img = 1, nrow_images DO col_img = 1, ncol_images CALL dbcsr_work_create(ums%mats(row_img, col_img), & SUM(recv_count(1, row_img, col_img, :)), n=1) CALL dbcsr_data_hold(received_data_area) CALL dbcsr_data_release(ums%mats(row_img, col_img)%wms(1)%data_area) ums%mats(row_img, col_img)%wms(1)%data_area = received_data_area END DO END DO END IF !$OMP BARRIER ! Add timing call to the packing of the send buffers ! CALL timeset(routineN//"_pack", handle2) ! Copies metadata and actual data to be sent into the send buffers. CALL dbcsr_iterator_start(iter, ism, shared=.TRUE.) SELECT CASE (data_type) CASE (dbcsr_type_real_4) CALL prepare_buffers_s(sm%negate_real, sm%negate_imaginary, & iter, row, col, blk, blk_p, bp, & row_size, col_size, nze, nsymmetries, symmetry_i, & stored_row, stored_col, tr_row_size, tr_col_size, tr, & row_img, col_img, nrow_images, ncol_images, & row_img_dist, col_img_dist, predist_type_fwd, blacs2mpi, & target_imgdist, prow, pcol, rowi, coli, & row_dist, col_dist, dst_p, sm_pos, myt_smp, metalen, & sd_pos, myt_sdp, send_meta, sd_disp, & dbcsr_get_data_p_s(sm%data_area), & send_data_area, scale_neg_one, scale_value) CASE (dbcsr_type_real_8) CALL prepare_buffers_d(sm%negate_real, sm%negate_imaginary, & iter, row, col, blk, blk_p, bp, & row_size, col_size, nze, nsymmetries, symmetry_i, & stored_row, stored_col, tr_row_size, tr_col_size, tr, & row_img, col_img, nrow_images, ncol_images, & row_img_dist, col_img_dist, predist_type_fwd, blacs2mpi, & target_imgdist, prow, pcol, rowi, coli, & row_dist, col_dist, dst_p, sm_pos, myt_smp, metalen, & sd_pos, myt_sdp, send_meta, sd_disp, & dbcsr_get_data_p_d(sm%data_area), & send_data_area, scale_neg_one, scale_value) CASE (dbcsr_type_complex_4) CALL prepare_buffers_c(sm%negate_real, sm%negate_imaginary, & iter, row, col, blk, blk_p, bp, & row_size, col_size, nze, nsymmetries, symmetry_i, & stored_row, stored_col, tr_row_size, tr_col_size, tr, & row_img, col_img, nrow_images, ncol_images, & row_img_dist, col_img_dist, predist_type_fwd, blacs2mpi, & target_imgdist, prow, pcol, rowi, coli, & row_dist, col_dist, dst_p, sm_pos, myt_smp, metalen, & sd_pos, myt_sdp, send_meta, sd_disp, & dbcsr_get_data_p_c(sm%data_area), & send_data_area, scale_neg_one, scale_value) CASE (dbcsr_type_complex_8) CALL prepare_buffers_z(sm%negate_real, sm%negate_imaginary, & iter, row, col, blk, blk_p, bp, & row_size, col_size, nze, nsymmetries, symmetry_i, & stored_row, stored_col, tr_row_size, tr_col_size, tr, & row_img, col_img, nrow_images, ncol_images, & row_img_dist, col_img_dist, predist_type_fwd, blacs2mpi, & target_imgdist, prow, pcol, rowi, coli, & row_dist, col_dist, dst_p, sm_pos, myt_smp, metalen, & sd_pos, myt_sdp, send_meta, sd_disp, & dbcsr_get_data_p_z(sm%data_area), & send_data_area, scale_neg_one, scale_value) END SELECT CALL dbcsr_iterator_stop(iter) ! Deallocate thread private data ! DEALLOCATE (myt_send_count) DEALLOCATE (myt_total_send_count) DEALLOCATE (myt_smp, myt_sdp) CALL timestop(handle2) !$OMP END PARALLEL ! Exchange the data and metadata structures. In the interesting cases (square grids, row col distribution same), ! there are only very few processors that need to exchange data. ! The hybrid_alltoall deals with this by doing point to point communication CALL timeset(routineN//"_data", handle2) CALL hybrid_alltoall_any(send_data_area, total_send_count(2, :), sd_disp(:) - 1, & recv_data_area, total_recv_count(2, :), rd_disp(:) - 1, & mp_obj, & most_ptp=.TRUE., remainder_ptp=.TRUE., no_hybrid=.FALSE.) CALL hybrid_alltoall_i1( & send_meta(:), metalen*total_send_count(1, :), sm_disp(:) - 1, & recv_meta(:), metalen*total_recv_count(1, :), rm_disp(:) - 1, & most_ptp=.TRUE., remainder_ptp=.TRUE., no_hybrid=.FALSE., & mp_env=mp_obj) CALL timestop(handle2) ! Now create the work index and/or copy the relevant data from the ! receive buffer into the local indices. ! DO src_p = 0, numproc - 1 DO blk_l = 1, total_recv_count(1, src_p) stored_row = recv_meta(rm_disp(src_p) + metalen*(blk_l - 1)) stored_col = recv_meta(rm_disp(src_p) + metalen*(blk_l - 1) + 1) stored_blk_p = recv_meta(rm_disp(src_p) + metalen*(blk_l - 1) + 2) row_img = recv_meta(rm_disp(src_p) + metalen*(blk_l - 1) + 3) col_img = recv_meta(rm_disp(src_p) + metalen*(blk_l - 1) + 4) nze = row_blk_size(stored_row)*col_blk_size(stored_col) blk = blks(row_img, col_img) blks(row_img, col_img) = blks(row_img, col_img) + 1 blk_ps(row_img, col_img) = blk_ps(row_img, col_img) + nze ums%mats(row_img, col_img)%wms(1)%row_i(blk) = stored_row ums%mats(row_img, col_img)%wms(1)%col_i(blk) = stored_col ums%mats(row_img, col_img)%wms(1)%blk_p(blk) = & SIGN(rd_disp(src_p) + ABS(stored_blk_p) - 1, stored_blk_p) END DO END DO ! Finalize the actual imaged matrices from the work matrices ! DO row_img = 1, nrow_images DO col_img = 1, ncol_images ums%mats(row_img, col_img)%wms(1)%lastblk = blks(row_img, col_img) - 1 ums%mats(row_img, col_img)%wms(1)%datasize = blk_ps(row_img, col_img) - 1 ! CALL dbcsr_data_set_size_referenced( & ums%mats(row_img, col_img)%wms(1)%data_area, & ums%mats(row_img, col_img)%wms(1)%datasize) IF (nrow_images .EQ. 1 .AND. ncol_images .EQ. 1 .OR. nocopy) THEN CALL dbcsr_special_finalize(ums%mats(row_img, col_img), reshuffle=.FALSE.) ELSE CALL dbcsr_special_finalize(ums%mats(row_img, col_img), reshuffle=.TRUE.) END IF ! Save the home process and image row and column CALL image_calculator(target_imgdist, & ums%mats(row_img, col_img)%index(dbcsr_slot_home_prow), & ums%mats(row_img, col_img)%index(dbcsr_slot_home_rowi), & ums%mats(row_img, col_img)%index(dbcsr_slot_home_pcol), & ums%mats(row_img, col_img)%index(dbcsr_slot_home_coli), & vprow=ums%mats(row_img, col_img)%index(dbcsr_slot_home_vprow), & vpcol=ums%mats(row_img, col_img)%index(dbcsr_slot_home_vpcol), & myrowi=row_img, mycoli=col_img, & shifting=predist_type) END DO END DO ! Deallocate shared temporary buffers ! DEALLOCATE (send_count, recv_count) DEALLOCATE (total_send_count, total_recv_count) DEALLOCATE (sdp, smp, sd_disp, sm_disp) DEALLOCATE (rd_disp, rm_disp) DEALLOCATE (all_total_send_offset) DEALLOCATE (blk_ps, blks) DEALLOCATE (recv_meta, send_meta) CALL dbcsr_data_release(send_data_area) CALL dbcsr_data_release(received_data_area) CALL dbcsr_data_release(recv_data_area) CALL timestop(handle) END SUBROUTINE make_images