Fully redistributes a DBCSR matrix. The new distribution may be arbitrary as long as the total number of full rows and columns matches that of the existing matrix.
Type | Intent | Optional | Attributes | Name | Description
---|---|---|---|---|---
type(dbcsr_type) | intent(in) | | :: | matrix | matrix to redistribute
type(dbcsr_type) | intent(inout) | | :: | redist | redistributed matrix
logical | intent(in) | optional | :: | keep_sparsity | retains the sparsity of the redist matrix
logical | intent(in) | optional | :: | summation | sum blocks with identical row and col from different processes
```fortran
   SUBROUTINE dbcsr_complete_redistribute(matrix, redist, keep_sparsity, summation)
      !! Fully redistributes a DBCSR matrix.
      !! The new distribution may be arbitrary as long as the total
      !! number of full rows and columns matches that of the existing
      !! matrix.

      TYPE(dbcsr_type), INTENT(IN)                       :: matrix
         !! matrix to redistribute
      TYPE(dbcsr_type), INTENT(INOUT)                    :: redist
         !! redistributed matrix
      LOGICAL, INTENT(IN), OPTIONAL                      :: keep_sparsity, summation
         !! retains the sparsity of the redist matrix
         !! sum blocks with identical row and col from different processes

      CHARACTER(len=*), PARAMETER :: routineN = 'dbcsr_complete_redistribute'
      INTEGER, PARAMETER :: metalen = 7
      LOGICAL, PARAMETER :: dbg = .FALSE.

      INTEGER :: blk, blk_col_new, blk_ps, blk_row_new, blks, cnt_fnd, cnt_new, cnt_skip, col, &
                 col_int, col_offset_new, col_offset_old, col_rle, col_size, col_size_new, data_offset_l, &
                 data_type, dst_p, handle, i, meta_l, numnodes, nze_rle, row, row_int, &
                 row_offset_new, row_offset_old, row_rle, row_size, row_size_new, src_p, stored_col_new, &
                 stored_row_new
      INTEGER, ALLOCATABLE, DIMENSION(:) :: col_end_new, col_end_old, col_start_new, &
                 col_start_old, rd_disp, recv_meta, rm_disp, row_end_new, row_end_old, row_start_new, &
                 row_start_old, sd_disp, sdp, send_meta, sm_disp, smp
      INTEGER, ALLOCATABLE, DIMENSION(:, :) :: col_reblocks, n_col_reblocks, n_row_reblocks, &
                 recv_count, row_reblocks, send_count, total_recv_count, total_send_count
      INTEGER, DIMENSION(:), POINTER :: col_blk_size_new, col_blk_size_old, &
                                        col_dist_new, row_blk_size_new, &
                                        row_blk_size_old, row_dist_new
      INTEGER, DIMENSION(:, :), POINTER :: pgrid
      LOGICAL :: found, my_keep_sparsity, my_summation, &
                 sym, tr, valid_block
      REAL(kind=dp) :: cs1, cs2
      TYPE(dbcsr_data_obj) :: buff_data, data_block, recv_data, &
                              send_data
      TYPE(dbcsr_distribution_obj) :: dist_new
      TYPE(dbcsr_iterator) :: iter
      TYPE(dbcsr_mp_obj) :: mp_obj_new
      TYPE(mp_comm_type) :: mp_group

!   ---------------------------------------------------------------------------

      CALL timeset(routineN, handle)

      IF (.NOT. dbcsr_valid_index(matrix)) &
         DBCSR_ABORT("Input not valid.")
      IF (matrix%replication_type .NE. dbcsr_repl_none) &
         DBCSR_WARN("Can not redistribute replicated matrix.")
      IF (dbcsr_has_symmetry(matrix) .AND. .NOT. dbcsr_has_symmetry(redist)) &
         DBCSR_ABORT("Can not redistribute a symmetric matrix into a non-symmetric one")
      !
      my_keep_sparsity = .FALSE.
      IF (PRESENT(keep_sparsity)) my_keep_sparsity = keep_sparsity
      !
      my_summation = .FALSE.
      IF (PRESENT(summation)) my_summation = summation
      ! zero blocks that might be present in the target (redist) but not in the source (matrix)
      CALL dbcsr_set(redist, 0.0_dp)

      sym = dbcsr_has_symmetry(redist)

      data_type = matrix%data_type
      ! Get row and column start and end positions
      ! Old matrix
      row_blk_size_old => array_data(matrix%row_blk_size)
      col_blk_size_old => array_data(matrix%col_blk_size)
      ALLOCATE (row_start_old(dbcsr_nblkrows_total(matrix)), &
                row_end_old(dbcsr_nblkrows_total(matrix)), &
                col_start_old(dbcsr_nblkcols_total(matrix)), &
                col_end_old(dbcsr_nblkcols_total(matrix)))
      CALL convert_sizes_to_offsets(row_blk_size_old, &
                                    row_start_old, row_end_old)
      CALL convert_sizes_to_offsets(col_blk_size_old, &
                                    col_start_old, col_end_old)
      !
      ! New matrix
      dist_new = dbcsr_distribution(redist)
      row_blk_size_new => array_data(redist%row_blk_size)
      col_blk_size_new => array_data(redist%col_blk_size)
      ALLOCATE (row_start_new(dbcsr_nblkrows_total(redist)), &
                row_end_new(dbcsr_nblkrows_total(redist)), &
                col_start_new(dbcsr_nblkcols_total(redist)), &
                col_end_new(dbcsr_nblkcols_total(redist)))
      CALL convert_sizes_to_offsets(row_blk_size_new, &
                                    row_start_new, row_end_new)
      CALL convert_sizes_to_offsets(col_blk_size_new, &
                                    col_start_new, col_end_new)
      row_dist_new => dbcsr_distribution_row_dist(dist_new)
      col_dist_new => dbcsr_distribution_col_dist(dist_new)
      ! Create mappings
      i = dbcsr_nfullrows_total(redist)
      ALLOCATE (row_reblocks(4, i))
      ALLOCATE (n_row_reblocks(2, dbcsr_nblkrows_total(matrix)))
      CALL dbcsr_reblocking_targets(row_reblocks, i, n_row_reblocks, &
                                    row_blk_size_old, row_blk_size_new)
      i = dbcsr_nfullcols_total(redist)
      ALLOCATE (col_reblocks(4, i))
      ALLOCATE (n_col_reblocks(2, dbcsr_nblkcols_total(matrix)))
      CALL dbcsr_reblocking_targets(col_reblocks, i, n_col_reblocks, &
                                    col_blk_size_old, col_blk_size_new)
      !
      mp_obj_new = dbcsr_distribution_mp(dist_new)
      pgrid => dbcsr_mp_pgrid(mp_obj_new)
      numnodes = dbcsr_mp_numnodes(mp_obj_new)
      mp_group = dbcsr_mp_group(mp_obj_new)
      !
      IF (MAXVAL(row_dist_new) > UBOUND(pgrid, 1)) &
         DBCSR_ABORT('Row distribution references unexistent processor rows')
      IF (dbg) THEN
         IF (MAXVAL(row_dist_new) .NE. UBOUND(pgrid, 1)) &
            DBCSR_WARN('Range of row distribution not equal to processor rows')
      END IF
      IF (MAXVAL(col_dist_new) > UBOUND(pgrid, 2)) &
         DBCSR_ABORT('Col distribution references unexistent processor cols')
      IF (dbg) THEN
         IF (MAXVAL(col_dist_new) .NE. UBOUND(pgrid, 2)) &
            DBCSR_WARN('Range of col distribution not equal to processor cols')
      END IF

      ALLOCATE (send_count(2, 0:numnodes - 1))
      ALLOCATE (recv_count(2, 0:numnodes - 1))
      ALLOCATE (total_send_count(2, 0:numnodes - 1))
      ALLOCATE (total_recv_count(2, 0:numnodes - 1))
      ALLOCATE (sdp(0:numnodes - 1))
      ALLOCATE (sd_disp(0:numnodes - 1))
      ALLOCATE (smp(0:numnodes - 1))
      ALLOCATE (sm_disp(0:numnodes - 1))
      ALLOCATE (rd_disp(0:numnodes - 1))
      ALLOCATE (rm_disp(0:numnodes - 1))

      IF (dbg) THEN
         cs1 = dbcsr_checksum(matrix)
      END IF
      !cs1 = dbcsr_checksum (matrix)
      !call dbcsr_print(matrix)
      !
      !
      ! Count initial sizes for sending.
      !
      ! We go through every element of every local block and determine
      ! to which processor it must be sent. It could be more efficient,
      ! but at least the index data are run-length encoded.
      send_count(:, :) = 0
      CALL dbcsr_iterator_start(iter, matrix)
      dst_p = -1
      DO WHILE (dbcsr_iterator_blocks_left(iter))
         CALL dbcsr_iterator_next_block(iter, row, col, blk)
         DO col_int = n_col_reblocks(1, col), &
            n_col_reblocks(1, col) + n_col_reblocks(2, col) - 1
            blk_col_new = col_reblocks(1, col_int)
            DO row_int = n_row_reblocks(1, row), &
               n_row_reblocks(1, row) + n_row_reblocks(2, row) - 1
               blk_row_new = row_reblocks(1, row_int)
               IF (.NOT. sym .OR. blk_col_new .GE. blk_row_new) THEN
                  tr = .FALSE.
                  CALL dbcsr_get_stored_coordinates(redist, &
                                                    blk_row_new, blk_col_new, dst_p)
                  send_count(1, dst_p) = send_count(1, dst_p) + 1
                  send_count(2, dst_p) = send_count(2, dst_p) + &
                                         col_reblocks(2, col_int)*row_reblocks(2, row_int)
               END IF
            END DO
         END DO
      END DO
      CALL dbcsr_iterator_stop(iter)
      !
      !
      CALL mp_alltoall(send_count, recv_count, 2, mp_group)
      ! Allocate data structures needed for data exchange.
      CALL dbcsr_data_init(recv_data)
      CALL dbcsr_data_new(recv_data, data_type, SUM(recv_count(2, :)))
      ALLOCATE (recv_meta(metalen*SUM(recv_count(1, :))))
      CALL dbcsr_data_init(send_data)
      CALL dbcsr_data_new(send_data, data_type, SUM(send_count(2, :)))
      ALLOCATE (send_meta(metalen*SUM(send_count(1, :))))
      ! Fill in the meta data structures and copy the data.
      DO dst_p = 0, numnodes - 1
         total_send_count(1, dst_p) = send_count(1, dst_p)
         total_send_count(2, dst_p) = send_count(2, dst_p)
         total_recv_count(1, dst_p) = recv_count(1, dst_p)
         total_recv_count(2, dst_p) = recv_count(2, dst_p)
      END DO
      sd_disp = -1; sm_disp = -1
      rd_disp = -1; rm_disp = -1
      sd_disp(0) = 1; sm_disp(0) = 1
      rd_disp(0) = 1; rm_disp(0) = 1
      DO dst_p = 1, numnodes - 1
         sm_disp(dst_p) = sm_disp(dst_p - 1) &
                          + metalen*total_send_count(1, dst_p - 1)
         sd_disp(dst_p) = sd_disp(dst_p - 1) &
                          + total_send_count(2, dst_p - 1)
         rm_disp(dst_p) = rm_disp(dst_p - 1) &
                          + metalen*total_recv_count(1, dst_p - 1)
         rd_disp(dst_p) = rd_disp(dst_p - 1) &
                          + total_recv_count(2, dst_p - 1)
      END DO
      sdp(:) = sd_disp           ! sdp points to the next place to store
                                 ! data. It is postincremented.
      smp(:) = sm_disp - metalen ! But smp points to the "working" data, not
                                 ! the next. It is pre-incremented, so we must
                                 ! first rewind it.
      !
      CALL dbcsr_data_init(data_block)
      CALL dbcsr_data_new(data_block, data_type)
      CALL dbcsr_iterator_start(iter, matrix)
      dst_p = -1
      DO WHILE (dbcsr_iterator_blocks_left(iter))
         CALL dbcsr_iterator_next_block(iter, row, col, data_block, tr, blk, &
                                        row_size=row_size, col_size=col_size)
         !IF (tr) WRITE(*,*)"block at",row,col," is transposed"
         DO col_int = n_col_reblocks(1, col), &
            n_col_reblocks(1, col) + n_col_reblocks(2, col) - 1
            blk_col_new = col_reblocks(1, col_int)
            DO row_int = n_row_reblocks(1, row), &
               n_row_reblocks(1, row) + n_row_reblocks(2, row) - 1
               blk_row_new = row_reblocks(1, row_int)
               loc_ok: IF (.NOT. sym .OR. blk_col_new .GE. blk_row_new) THEN
                  IF (dbg) &
                     WRITE (*, *) 'using block', blk_row_new, 'x', blk_col_new
                  ! Start a new RLE run
                  tr = .FALSE.
                  CALL dbcsr_get_stored_coordinates(redist, &
                                                    blk_row_new, blk_col_new, dst_p)
                  row_offset_old = row_reblocks(3, row_int)
                  col_offset_old = col_reblocks(3, col_int)
                  row_offset_new = row_reblocks(4, row_int)
                  col_offset_new = col_reblocks(4, col_int)
                  row_rle = row_reblocks(2, row_int)
                  col_rle = col_reblocks(2, col_int)
                  smp(dst_p) = smp(dst_p) + metalen
                  send_meta(smp(dst_p)) = blk_row_new        ! new blocked row
                  send_meta(smp(dst_p) + 1) = blk_col_new    ! new blocked column
                  send_meta(smp(dst_p) + 2) = row_offset_new ! row in new block
                  send_meta(smp(dst_p) + 3) = col_offset_new ! col in new block
                  send_meta(smp(dst_p) + 4) = row_rle        ! RLE rows
                  send_meta(smp(dst_p) + 5) = col_rle        ! RLE columns
                  send_meta(smp(dst_p) + 6) = sdp(dst_p) - sd_disp(dst_p) ! Offset in data
                  nze_rle = row_rle*col_rle
                  ! Copy current block into the send buffer
                  CALL dbcsr_block_partial_copy( &
                     send_data, dst_offset=sdp(dst_p) - 1, &
                     dst_rs=row_rle, dst_cs=col_rle, dst_tr=.FALSE., &
                     dst_r_lb=1, dst_c_lb=1, &
                     src=data_block, &
                     src_rs=row_size, src_cs=col_size, src_tr=tr, &
                     src_r_lb=row_offset_old, src_c_lb=col_offset_old, &
                     nrow=row_rle, ncol=col_rle)
                  sdp(dst_p) = sdp(dst_p) + nze_rle
               END IF loc_ok
            END DO ! row_int
         END DO ! col_int
      END DO
      CALL dbcsr_iterator_stop(iter)
      CALL dbcsr_data_clear_pointer(data_block)
      CALL dbcsr_data_release(data_block)
      ! Exchange the data and metadata structures.
      !
      SELECT CASE (data_type)
      CASE (dbcsr_type_real_4)
         CALL hybrid_alltoall_s1( &
            send_data%d%r_sp(:), total_send_count(2, :), sd_disp(:) - 1, &
            recv_data%d%r_sp(:), total_recv_count(2, :), rd_disp(:) - 1, &
            mp_obj_new)
      CASE (dbcsr_type_real_8)
         !CALL mp_alltoall(&
         !     send_data%d%r_dp(:), total_send_count(2,:), sd_disp(:)-1,&
         !     recv_data%d%r_dp(:), total_recv_count(2,:), rd_disp(:)-1,&
         !     mp_group)
         CALL hybrid_alltoall_d1( &
            send_data%d%r_dp(:), total_send_count(2, :), sd_disp(:) - 1, &
            recv_data%d%r_dp(:), total_recv_count(2, :), rd_disp(:) - 1, &
            mp_obj_new)
      CASE (dbcsr_type_complex_4)
         CALL hybrid_alltoall_c1( &
            send_data%d%c_sp(:), total_send_count(2, :), sd_disp(:) - 1, &
            recv_data%d%c_sp(:), total_recv_count(2, :), rd_disp(:) - 1, &
            mp_obj_new)
      CASE (dbcsr_type_complex_8)
         CALL hybrid_alltoall_z1( &
            send_data%d%c_dp(:), total_send_count(2, :), sd_disp(:) - 1, &
            recv_data%d%c_dp(:), total_recv_count(2, :), rd_disp(:) - 1, &
            mp_obj_new)
      CASE default
         DBCSR_ABORT("Invalid matrix type")
      END SELECT
      CALL hybrid_alltoall_i1(send_meta(:), metalen*total_send_count(1, :), sm_disp(:) - 1, &
                              recv_meta(:), metalen*total_recv_count(1, :), rm_disp(:) - 1, mp_obj_new)
      !
      ! Now fill in the data.
      CALL dbcsr_work_create(redist, &
                             nblks_guess=SUM(recv_count(1, :)), &
                             sizedata_guess=SUM(recv_count(2, :)), work_mutable=.TRUE.)
      CALL dbcsr_data_init(buff_data)
      CALL dbcsr_data_init(data_block)
      CALL dbcsr_data_new(buff_data, dbcsr_type_1d_to_2d(data_type), &
                          redist%max_rbs, redist%max_cbs)
      CALL dbcsr_data_new(data_block, dbcsr_type_1d_to_2d(data_type))
      !blk_p = 1
      !blk = 1
      blk_ps = 0
      blks = 0
      cnt_fnd = 0; cnt_new = 0; cnt_skip = 0
      DO src_p = 0, numnodes - 1
         data_offset_l = rd_disp(src_p)
         DO meta_l = 1, recv_count(1, src_p)
            stored_row_new = recv_meta(rm_disp(src_p) + metalen*(meta_l - 1))
            stored_col_new = recv_meta(rm_disp(src_p) + metalen*(meta_l - 1) + 1)
            row_offset_new = recv_meta(rm_disp(src_p) + metalen*(meta_l - 1) + 2)
            col_offset_new = recv_meta(rm_disp(src_p) + metalen*(meta_l - 1) + 3)
            row_rle = recv_meta(rm_disp(src_p) + metalen*(meta_l - 1) + 4)
            col_rle = recv_meta(rm_disp(src_p) + metalen*(meta_l - 1) + 5)
            data_offset_l = rd_disp(src_p) &
                            + recv_meta(rm_disp(src_p) + metalen*(meta_l - 1) + 6)
            CALL dbcsr_data_clear_pointer(data_block)
            CALL dbcsr_get_block_p(redist, stored_row_new, stored_col_new, &
                                   data_block, tr, found)
            valid_block = found
            IF (found) cnt_fnd = cnt_fnd + 1
            IF (.NOT. found .AND. .NOT. my_keep_sparsity) THEN
               ! We have to set up a buffer block
               CALL dbcsr_data_set_pointer(data_block, &
                                           rsize=row_blk_size_new(stored_row_new), &
                                           csize=col_blk_size_new(stored_col_new), &
                                           pointee=buff_data)
               CALL dbcsr_data_clear(data_block)
               !r2_dp => r2_dp_buff(1:row_blk_size_new (stored_row_new),&
               !     1:col_blk_size_new (stored_col_new))
               !r2_dp(:,:) = 0.0_dp
               tr = .FALSE.
               blks = blks + 1
               blk_ps = blk_ps + row_blk_size_new(stored_row_new)* &
                        col_blk_size_new(stored_col_new)
               valid_block = .TRUE.
               cnt_new = cnt_new + 1
            END IF
            nze_rle = row_rle*col_rle
            IF (valid_block) THEN
               row_size_new = row_blk_size_new(stored_row_new)
               col_size_new = col_blk_size_new(stored_col_new)
               CALL dbcsr_block_partial_copy( &
                  dst=data_block, dst_tr=tr, &
                  dst_rs=row_size_new, dst_cs=col_size_new, &
                  dst_r_lb=row_offset_new, dst_c_lb=col_offset_new, &
                  src=recv_data, src_offset=data_offset_l - 1, &
                  src_rs=row_rle, src_cs=col_rle, src_tr=.FALSE., &
                  src_r_lb=1, src_c_lb=1, &
                  nrow=row_rle, ncol=col_rle)
            ELSE
               cnt_skip = cnt_skip + 1
            END IF
            data_offset_l = data_offset_l + nze_rle
            IF ((.NOT. found .OR. my_summation) .AND. valid_block) THEN
               IF (dbg) WRITE (*, *) routineN//" Adding new block at", &
                  stored_row_new, stored_col_new
               CALL dbcsr_put_block(redist, stored_row_new, stored_col_new, &
                                    data_block, transposed=tr, summation=my_summation)
               !DEALLOCATE (r2_dp)
            ELSE
               IF (.NOT. my_keep_sparsity .AND. dbg) &
                  WRITE (*, *) routineN//" Reusing block at", &
                  stored_row_new, stored_col_new
            END IF
         END DO
      END DO
      CALL dbcsr_data_clear_pointer(data_block)
      CALL dbcsr_data_release(buff_data)
      CALL dbcsr_data_release(data_block)
      !
      IF (dbg) THEN
         WRITE (*, *) routineN//" Declared blocks=", redist%wms(1)%lastblk, &
            "actual=", blks
         WRITE (*, *) routineN//" Declared data size=", redist%wms(1)%datasize, &
            "actual=", blk_ps
      END IF
      CALL dbcsr_finalize(redist)

      DEALLOCATE (send_count)
      DEALLOCATE (recv_count)
      DEALLOCATE (sdp); DEALLOCATE (sd_disp)
      DEALLOCATE (smp); DEALLOCATE (sm_disp)
      DEALLOCATE (rd_disp)
      DEALLOCATE (rm_disp)
      CALL dbcsr_data_release(recv_data)
      CALL dbcsr_data_release(send_data)
      DEALLOCATE (recv_meta)
      DEALLOCATE (send_meta)
      !if (dbg) call dbcsr_print(redist)
      IF (dbg) THEN
         cs2 = dbcsr_checksum(redist)
         WRITE (*, *) routineN//" Checksums=", cs1, cs2, cs1 - cs2
      END IF
      !IF(cs1-cs2 > 0.00001) DBCSR_ABORT("Mangled data!")

      CALL timestop(handle)
   END SUBROUTINE dbcsr_complete_redistribute
```
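
A note for readers of the packing and unpacking loops above: each run-length-encoded fragment travels with a record of `metalen = 7` integers in `send_meta`/`recv_meta`. The field meanings below are taken directly from the comments in the packing loop:

1. `blk_row_new` — destination blocked row
2. `blk_col_new` — destination blocked column
3. `row_offset_new` — starting row within the destination block
4. `col_offset_new` — starting column within the destination block
5. `row_rle` — number of rows in the run
6. `col_rle` — number of columns in the run
7. offset of the run's data within the destination process's slice of the data buffer (`sdp(dst_p) - sd_disp(dst_p)`)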