# 1 "/__w/dbcsr/dbcsr/src/mm/dbcsr_mm_cannon.F" 1 !--------------------------------------------------------------------------------------------------! ! Copyright (C) by the DBCSR developers group - All rights reserved ! ! This file is part of the DBCSR library. ! ! ! ! For information on the license, see the LICENSE file. ! ! For further information please visit https://dbcsr.cp2k.org ! ! SPDX-License-Identifier: GPL-2.0+ ! !--------------------------------------------------------------------------------------------------! MODULE dbcsr_mm_cannon !! First layer of the dbcsr matrix-matrix multiplication. !! It performs the MPI parallelization according to Cannon's algorithm. !! <b>Modification history:</b> !! - 2010-02-23 Moved from dbcsr_operations !! - 2011-11 Moved parameter-stack processing routines to !! dbcsr_mm_methods. !! - 2013-01 reorganized code (Ole Schuett) USE dbcsr_acc_event, ONLY: acc_event_synchronize USE dbcsr_acc_device, ONLY: acc_device_synchronize USE dbcsr_array_types, ONLY: array_data, & array_exists, & array_i1d_obj, & array_size USE dbcsr_block_operations, ONLY: dbcsr_block_conjg, & dbcsr_block_copy_aa, & dbcsr_block_real_neg, & dbcsr_block_scale, & dbcsr_block_transpose_aa, & dbcsr_block_transpose USE dbcsr_config, ONLY: dbcsr_cfg, & has_acc USE dbcsr_data_methods, ONLY: & dbcsr_data_clear_pointer, dbcsr_data_ensure_size, dbcsr_data_get_size, & dbcsr_data_get_size_referenced, dbcsr_data_hold, dbcsr_data_host2dev, dbcsr_data_init, & dbcsr_data_new, dbcsr_data_release, dbcsr_data_set_pointer, & dbcsr_data_set_size_referenced, dbcsr_scalar_are_equal, dbcsr_scalar_negative, & dbcsr_scalar_one, dbcsr_get_data_p_s, dbcsr_get_data_p_d, dbcsr_get_data_p_c, dbcsr_get_data_p_z USE dbcsr_data_types, ONLY: dbcsr_datatype_sizeof USE dbcsr_dist_methods, ONLY: dbcsr_distribution_col_dist, & dbcsr_distribution_has_threads, & dbcsr_distribution_max_col_dist, & dbcsr_distribution_max_row_dist, & dbcsr_distribution_mp, & dbcsr_distribution_row_dist USE dbcsr_index_operations, ONLY: dbcsr_count_row_index, & dbcsr_has_local_row_index, & dbcsr_index_prune_deleted, & dbcsr_make_index_list, & dbcsr_make_index_local_row, & dbcsr_repoint_index USE dbcsr_io, ONLY: dbcsr_print USE dbcsr_iterator_operations, ONLY: dbcsr_iterator_blocks_left, & dbcsr_iterator_next_block, & dbcsr_iterator_start, & dbcsr_iterator_stop USE dbcsr_mem_methods, ONLY: dbcsr_mempool_limit_capacity USE dbcsr_methods, ONLY: & dbcsr_destroy_array, dbcsr_distribution, dbcsr_get_data_type, dbcsr_get_index_memory_type, & dbcsr_has_symmetry, dbcsr_image_dist_hold, dbcsr_image_dist_init, & dbcsr_image_dist_release, dbcsr_max_col_size, dbcsr_max_row_size, dbcsr_nblkcols_local, & dbcsr_nblkrows_local, dbcsr_nblkrows_total, dbcsr_release, dbcsr_valid_index USE dbcsr_mm_common, ONLY: & acc_transpose_blocks, calculate_norms, count_mpi_statistics, dbcsr_mm_multrec_type_p, & dbcsr_mpi_statistics, enumerate_blk_sizes, huge_norm, local_filter, max_memory, & memtype_abpanel_1, memtype_abpanel_2, memtype_mpi_buffer, memtype_trsbuffer_1, & memtype_trsbuffer_2, product_matrix_size_guess, rec_sort_index, setup_buffer_matrix USE dbcsr_mm_dist_operations, ONLY: dbcsr_get_local_vcols, & dbcsr_get_local_vrows, & dbcsr_reset_vlocals, & image_calculator USE dbcsr_mm_multrec, ONLY: dbcsr_mm_multrec_finalize, & dbcsr_mm_multrec_init, & dbcsr_mm_multrec_multiply USE dbcsr_mp_methods, ONLY: & dbcsr_mp_grid_setup, dbcsr_mp_group, dbcsr_mp_has_subgroups, dbcsr_mp_my_col_group, & dbcsr_mp_my_row_group, dbcsr_mp_mynode, dbcsr_mp_mypcol, dbcsr_mp_myprow, 
dbcsr_mp_npcols, & dbcsr_mp_nprows, dbcsr_mp_numnodes, dbcsr_mp_pgrid USE dbcsr_mp_operations, ONLY: dbcsr_irecv_any, & dbcsr_isend_any, & hybrid_alltoall_any, & hybrid_alltoall_i1 USE dbcsr_operations, ONLY: dbcsr_copy, & dbcsr_crop_matrix USE dbcsr_ptr_util, ONLY: ensure_array_size USE dbcsr_transformations, ONLY: dbcsr_make_dense_low USE dbcsr_types, ONLY: & dbcsr_2d_array_type, dbcsr_data_obj, dbcsr_distribution_obj, dbcsr_imagedistribution_obj, & dbcsr_iterator, dbcsr_memtype_type, dbcsr_meta_size, dbcsr_mp_obj, dbcsr_scalar_type, & dbcsr_slot_home_coli, dbcsr_slot_home_pcol, dbcsr_slot_home_prow, dbcsr_slot_home_rowi, & dbcsr_slot_home_vpcol, dbcsr_slot_home_vprow, dbcsr_slot_size, dbcsr_type, & dbcsr_type_complex_4, dbcsr_type_complex_8, dbcsr_type_int_4, dbcsr_type_no_symmetry, & dbcsr_type_real_4, dbcsr_type_real_8 USE dbcsr_dist_util, ONLY: count_bins, & dbcsr_checksum USE dbcsr_work_operations, ONLY: dbcsr_create, & dbcsr_special_finalize, & dbcsr_work_create USE dbcsr_kinds, ONLY: dp, & int_8, & real_4, & real_8, & sp USE dbcsr_machine, ONLY: default_output_unit, & m_memory USE dbcsr_mpiwrap, ONLY: mp_allgather, & mp_alltoall, & mp_irecv, & mp_isend, & mp_request_null, & mp_sum, & mp_testany, & mp_waitall, mp_comm_type, mp_request_type #include "base/dbcsr_base_uses.f90" !$ USE OMP_LIB, ONLY: omp_get_max_threads, omp_get_thread_num, omp_get_num_threads IMPLICIT NONE PRIVATE CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'dbcsr_mm_cannon' LOGICAL, PARAMETER :: debug_mod = .FALSE. LOGICAL, PARAMETER :: careful_mod = .FALSE. INTERFACE dbcsr_switch MODULE PROCEDURE dbcsr_switch_sets MODULE PROCEDURE dbcsr_switch_d_ptrs END INTERFACE PUBLIC :: multiply_cannon, make_m2s CONTAINS SUBROUTINE make_m2s(matrix, m2s, rdist, dense_rdist, & !! Make images from the matrix (left or right) use_dense_mult, ab_dense, predistribute, & f_k, l_k, f_row, l_row, f_col, l_col, & row_blk_size, col_blk_size, & k_vmap, m_map, n_map, & alpha) TYPE(dbcsr_type), INTENT(IN) :: matrix TYPE(dbcsr_2d_array_type), INTENT(OUT), POINTER :: m2s TYPE(dbcsr_imagedistribution_obj), INTENT(INOUT) :: rdist, dense_rdist LOGICAL, INTENT(IN) :: use_dense_mult, ab_dense CHARACTER, INTENT(IN) :: predistribute INTEGER, INTENT(IN) :: f_k, l_k, f_row, l_row, f_col, l_col TYPE(array_i1d_obj), INTENT(INOUT) :: row_blk_size, col_blk_size TYPE(array_i1d_obj), INTENT(IN) :: k_vmap, m_map, n_map TYPE(dbcsr_scalar_type), INTENT(IN), OPTIONAL :: alpha CHARACTER(len=*), PARAMETER :: routineN = 'make_m2s' INTEGER :: handle, i, im, j, jm INTEGER, DIMENSION(4) :: f_crop LOGICAL :: do_scale, thread_redist TYPE(array_i1d_obj) :: col_map, row_map TYPE(dbcsr_type) :: dense_template, matrix_tmp CALL timeset(routineN, handle) ALLOCATE (m2s) do_scale = .FALSE. IF (PRESENT(alpha)) THEN IF (.NOT. dbcsr_scalar_are_equal(alpha, dbcsr_scalar_one(alpha%data_type))) THEN do_scale = .TRUE. END IF END IF IF (do_scale) THEN ! Copy and scale matrix if alpha is not 1. CALL dbcsr_make_images(matrix, m2s, rdist, & predistribute=predistribute, & no_copy_data=use_dense_mult, scale_value=alpha) ELSE CALL dbcsr_make_images(matrix, m2s, rdist, & predistribute=predistribute, & no_copy_data=use_dense_mult) END IF im = SIZE(m2s%mats, 1) jm = SIZE(m2s%mats, 2) SELECT CASE (predistribute) CASE ('L') f_crop = (/f_row, l_row, f_k, l_k/) row_map = m_map col_map = k_vmap thread_redist = .TRUE. CASE default f_crop = (/f_k, l_k, f_col, l_col/) row_map = k_vmap col_map = n_map thread_redist = .FALSE. END SELECT ! Post-processing of images. 
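      ! For every image: the virtual-local row/column bookkeeping is refreshed
      ! against the image distribution, and, if any bound in f_crop is non-zero,
      ! the image is cropped to the requested full row/column bounds and the
      ! cropped copy replaces the original image matrix.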
DO i = 1, im DO j = 1, jm CALL dbcsr_reset_vlocals(m2s%mats(i, j), rdist) ! Crop if necessary IF (ANY(f_crop .NE. 0)) THEN matrix_tmp = dbcsr_type() CALL dbcsr_crop_matrix(matrix_tmp, m2s%mats(i, j), & full_row_bounds=f_crop(1:2), & full_column_bounds=f_crop(3:4), & shallow_data=.FALSE.) CALL dbcsr_release(m2s%mats(i, j)) CALL dbcsr_copy(m2s%mats(i, j), matrix_tmp, shallow_data=.TRUE.) CALL dbcsr_release(matrix_tmp) CALL dbcsr_reset_vlocals(m2s%mats(i, j), rdist) END IF END DO END DO IF (ab_dense) THEN dense_template = dbcsr_type() CALL dbcsr_create(dense_template, template=matrix, & dist=dense_rdist%i%main, & row_blk_size_obj=row_blk_size, col_blk_size_obj=col_blk_size) CALL dbcsr_make_images_dense(m2s, dense_rdist, & row_map=row_map, col_map=col_map, & join_cols=use_dense_mult, join_rows=ab_dense, & new_template=dense_template) CALL dbcsr_image_dist_release(rdist) rdist = dense_rdist CALL dbcsr_image_dist_hold(rdist) DO i = 1, im DO j = 1, jm CALL dbcsr_reset_vlocals(m2s%mats(i, j), rdist) END DO END DO END IF DO i = 1, im DO j = 1, jm ! Convert to local-row index CALL dbcsr_make_index_local_row(m2s%mats(i, j)) ! Convert to list index CALL dbcsr_make_index_list(m2s%mats(i, j), thread_redist=thread_redist) END DO END DO IF (ab_dense) THEN CALL dbcsr_image_dist_release(dense_rdist) CALL dbcsr_release(dense_template) END IF CALL timestop(handle) END SUBROUTINE make_m2s SUBROUTINE dbcsr_make_images(source, normalized, target_image_dist, & predistribute, no_copy_data, scale_value) !! Creates row and column images of a matrix. TYPE(dbcsr_type), INTENT(IN) :: source !! input matrix TYPE(dbcsr_2d_array_type), INTENT(OUT) :: normalized !! image array of the normalized matrix TYPE(dbcsr_imagedistribution_obj), INTENT(IN) :: target_image_dist !! normalize to this image distribution CHARACTER, INTENT(IN), OPTIONAL :: predistribute !! predistribute data for multiplication LOGICAL, INTENT(IN), OPTIONAL :: no_copy_data !! try to not merge data at the end TYPE(dbcsr_scalar_type), INTENT(IN), OPTIONAL :: scale_value !! scale with this value ! --------------------------------------------------------------------------- IF (dbcsr_cfg%use_mpi_rma%val) & DBCSR_ABORT("RMA algo not supported here!") IF (.NOT. dbcsr_valid_index(source)) & DBCSR_ABORT("Matrix not initialized.") CALL make_images(source, normalized, & target_image_dist, desymmetrize=dbcsr_has_symmetry(source), & predistribute=predistribute, & no_copy_data=no_copy_data, & scale_value=scale_value) normalized%image_dist = target_image_dist CALL dbcsr_image_dist_hold(normalized%image_dist) END SUBROUTINE dbcsr_make_images SUBROUTINE make_images(ism, ums, target_imgdist, desymmetrize, predistribute, & no_copy_data, scale_value) !! Makes column-based and row-based images of a matrix. TYPE(dbcsr_type), INTENT(IN) :: ism !! input symmetric matrix TYPE(dbcsr_2d_array_type), INTENT(OUT) :: ums !! normalized matrices TYPE(dbcsr_imagedistribution_obj), INTENT(IN) :: target_imgdist !! image distribution to normalize to LOGICAL, INTENT(IN), OPTIONAL :: desymmetrize !! desymmetrize a symmetric matrix CHARACTER, INTENT(IN), OPTIONAL :: predistribute !! predistribute data for multiplication LOGICAL, INTENT(IN), OPTIONAL :: no_copy_data !! try to not merge data at the end TYPE(dbcsr_scalar_type), INTENT(IN), OPTIONAL :: scale_value !! scale with this value CHARACTER(len=*), PARAMETER :: routineN = 'make_images' INTEGER, PARAMETER :: metalen = 5 LOGICAL, PARAMETER :: dbg = .FALSE. 
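      ! Layout of one block's metadata in the send/receive buffers (metalen = 5
      ! integers per block), as read back by the unpacking loop further below:
      !   base = rm_disp(src_p) + metalen*(blk_l - 1)
      !   recv_meta(base)     : stored block row
      !   recv_meta(base + 1) : stored block column
      !   recv_meta(base + 2) : signed offset of the block's data in the data buffer
      !   recv_meta(base + 3) : destination row image
      !   recv_meta(base + 4) : destination column image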
CHARACTER :: predist_type, predist_type_fwd INTEGER :: blk, blk_l, blk_p, bp, col, col_img, col_size, coli, data_type, dst_p, handle, & handle2, ithread, ncol_images, nrow_images, nsymmetries, nthreads, numproc, & nze, pcol, prow, row, row_img, row_size, rowi, sd_pos, sm_pos, src_p, stored_blk_p, & stored_col, stored_row, symmetry_i, tr_col_size, tr_row_size, vcol, vrow INTEGER, ALLOCATABLE, DIMENSION(:) :: myt_sdp, myt_smp, rd_disp, recv_meta, & rm_disp, sd_disp, sdp, send_meta, & sm_disp, smp INTEGER, ALLOCATABLE, DIMENSION(:, :) :: all_total_send_offset, blk_ps, blks, & myt_total_send_count, & total_recv_count, total_send_count INTEGER, ALLOCATABLE, DIMENSION(:, :, :, :) :: myt_send_count, recv_count, send_count INTEGER, DIMENSION(:), CONTIGUOUS, POINTER :: col_blk_size, col_dist, col_img_dist, & row_blk_size, row_dist, row_img_dist INTEGER, DIMENSION(:, :), CONTIGUOUS, POINTER :: blacs2mpi LOGICAL :: nocopy, tr TYPE(dbcsr_data_obj) :: received_data_area, recv_data_area, & send_data_area TYPE(dbcsr_distribution_obj) :: old_dist, target_dist TYPE(dbcsr_iterator) :: iter TYPE(dbcsr_memtype_type) :: data_memory_type TYPE(dbcsr_mp_obj) :: mp_obj TYPE(dbcsr_scalar_type) :: scale_neg_one TYPE(dbcsr_type) :: sm TYPE(mp_comm_type) :: mp_group ! --------------------------------------------------------------------------- ! Check input matrix ! Set convenient variables to access input matrix info ! CALL timeset(routineN, handle) nocopy = .FALSE. IF (PRESENT(no_copy_data)) nocopy = no_copy_data sm = ism nsymmetries = 1 IF (PRESENT(desymmetrize)) THEN IF (desymmetrize .AND. sm%symmetry) THEN nsymmetries = 2 END IF END IF SELECT CASE (predistribute) CASE ('L', 'l') predist_type = 'L' predist_type_fwd = 'l' CASE ('R', 'r') predist_type = 'R' predist_type_fwd = 'r' CASE default DBCSR_ABORT("Incorrect pre-shift specifier.") END SELECT data_type = sm%data_type IF (data_type .NE. dbcsr_type_real_8 .AND. & data_type .NE. dbcsr_type_real_4 .AND. & data_type .NE. dbcsr_type_complex_8 .AND. & data_type .NE. dbcsr_type_complex_4) & DBCSR_ABORT("Invalid data type.") scale_neg_one = dbcsr_scalar_negative(dbcsr_scalar_one(data_type)) row_blk_size => array_data(sm%row_blk_size) col_blk_size => array_data(sm%col_blk_size) old_dist = dbcsr_distribution(ism) target_dist = target_imgdist%i%main row_dist => dbcsr_distribution_row_dist(target_dist) col_dist => dbcsr_distribution_col_dist(target_dist) IF (sm%symmetry) THEN IF (SIZE(row_dist) .NE. SIZE(col_dist)) & DBCSR_WARN('Unequal row and column distributions for symmetric matrix.') END IF nrow_images = target_imgdist%i%row_decimation IF (nrow_images .GT. 1) THEN row_img_dist => array_data(target_imgdist%i%row_image) ELSE NULLIFY (row_img_dist) END IF ncol_images = target_imgdist%i%col_decimation IF (ncol_images .GT. 1) THEN col_img_dist => array_data(target_imgdist%i%col_image) ELSE NULLIFY (col_img_dist) END IF mp_obj = dbcsr_distribution_mp(target_dist) blacs2mpi => dbcsr_mp_pgrid(mp_obj) numproc = dbcsr_mp_numnodes(mp_obj) mp_group = dbcsr_mp_group(mp_obj) IF (dbcsr_distribution_max_row_dist(old_dist) .GT. UBOUND(blacs2mpi, 1)) & DBCSR_ABORT('Row distribution references unexistent processor rows') IF (dbg) THEN IF (dbcsr_distribution_max_row_dist(old_dist) .NE. UBOUND(blacs2mpi, 1)) & DBCSR_WARN('Range of row distribution not equal to processor rows') END IF IF (dbcsr_distribution_max_col_dist(old_dist) .GT. UBOUND(blacs2mpi, 2)) & DBCSR_ABORT('Col distribution references unexistent processor cols') IF (dbg) THEN IF (dbcsr_distribution_max_col_dist(old_dist) .NE. 
UBOUND(blacs2mpi, 2)) & DBCSR_WARN('Range of col distribution not equal to processor cols') END IF ! Check threads configuration !$ IF (.NOT. dbcsr_distribution_has_threads(old_dist)) & !$ DBCSR_ABORT("Thread distribution not defined") ! Allocate shared temporary buffers ! ALLOCATE (send_count(2, nrow_images, ncol_images, 0:numproc - 1)); send_count = 0 ALLOCATE (recv_count(2, nrow_images, ncol_images, 0:numproc - 1)) ALLOCATE (total_send_count(2, 0:numproc - 1)); total_send_count = 0 ALLOCATE (total_recv_count(2, 0:numproc - 1)) ALLOCATE (sdp(0:numproc - 1)) ALLOCATE (smp(0:numproc - 1)) ALLOCATE (sd_disp(0:numproc - 1)); sd_disp(0) = 1 ALLOCATE (sm_disp(0:numproc - 1)); sm_disp(0) = 1 ALLOCATE (rd_disp(0:numproc - 1)); rd_disp(0) = 1 ALLOCATE (rm_disp(0:numproc - 1)); rm_disp(0) = 1 ALLOCATE (all_total_send_offset(2, 0:numproc - 1)) ALLOCATE (blk_ps(nrow_images, ncol_images)); blk_ps = 1 ALLOCATE (blks(nrow_images, ncol_images)); blks = 1 ! ! Allocate and init mats ! ALLOCATE (ums%mats(nrow_images, ncol_images)) data_memory_type = memtype_abpanel_1 DO row_img = 1, nrow_images DO col_img = 1, ncol_images ums%mats(row_img, col_img) = dbcsr_type() CALL dbcsr_create(ums%mats(row_img, col_img), "imaged "//sm%name, & target_dist, & dbcsr_type_no_symmetry, & row_blk_size_obj=sm%row_blk_size, col_blk_size_obj=sm%col_blk_size, & nze=0, data_type=data_type, & max_rbs=sm%max_rbs, max_cbs=sm%max_cbs, & row_blk_offset=sm%row_blk_offset, col_blk_offset=sm%col_blk_offset, & thread_dist=sm%dist, & data_memory_type=data_memory_type, & index_memory_type=memtype_mpi_buffer) ums%mats(row_img, col_img)%negate_real = sm%negate_real ums%mats(row_img, col_img)%negate_imaginary = sm%negate_imaginary END DO END DO nthreads = 1 !$OMP PARALLEL DEFAULT (NONE) & !$OMP PRIVATE (ithread, symmetry_i, row_img, col_img, & !$OMP myt_send_count, myt_total_send_count, & !$OMP iter, row, col, blk, row_size, col_size, stored_row, stored_col, & !$OMP prow, pcol, rowi, coli, vrow, vcol, dst_p, nze, myt_smp, myt_sdp, & !$OMP blk_p, bp, sd_pos, sm_pos,tr, & !$OMP tr_row_size, tr_col_size) & !$OMP SHARED (nthreads, nsymmetries, row_img_dist, col_img_dist, & !$OMP nrow_images, ncol_images, numproc, scale_value, & !$OMP ums, sm, ism, target_imgdist, row_dist, col_dist,& !$OMP predist_type_fwd, blacs2mpi, row_blk_size, col_blk_size, & !$OMP send_count, recv_count, handle2,mp_group, & !$OMP total_send_count, total_recv_count, recv_data_area, nocopy, & !$OMP data_type, recv_meta, send_data_area, send_meta, & !$OMP sd_disp, sm_disp, rd_disp, rm_disp, all_total_send_offset, blk_ps, blks, & !$OMP received_data_area, scale_neg_one, memtype_abpanel_1) ithread = 0 !$ ithread = omp_get_thread_num() !$OMP MASTER !$ nthreads = omp_get_num_threads() !$OMP END MASTER ! Allocate thread private data ! ALLOCATE (myt_send_count(2, nrow_images, ncol_images, 0:numproc - 1)); myt_send_count(:, :, :, :) = 0 ALLOCATE (myt_total_send_count(2, 0:numproc - 1)) ! Thread-local pointers of the current adding position into the send buffers ALLOCATE (myt_smp(0:numproc - 1), myt_sdp(0:numproc - 1)) ! Count sizes for sending ! CALL dbcsr_iterator_start(iter, ism, shared=.TRUE.) DO WHILE (dbcsr_iterator_blocks_left(iter)) CALL dbcsr_iterator_next_block(iter, row, col, blk, & row_size=row_size, col_size=col_size) nze = row_size*col_size IF (nze .EQ. 0) CYCLE DO symmetry_i = 1, nsymmetries IF (symmetry_i .EQ. 1) THEN stored_row = row; stored_col = col ELSE IF (row .EQ. col) CYCLE stored_row = col; stored_col = row END IF ! Where do we send this block? 
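                  ! The block's row/column image is read from the image maps, and
                  ! image_calculator applies the Cannon pre-shift ('l' for the left,
                  ! 'r' for the right operand) to turn the owning process row/column
                  ! into the destination process (prow, pcol) and image indices
                  ! (rowi, coli); the MPI rank is then looked up in blacs2mpi.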
row_img = 1 IF (nrow_images .GT. 1) row_img = row_img_dist(stored_row) col_img = 1 IF (ncol_images .GT. 1) col_img = col_img_dist(stored_col) CALL image_calculator(target_imgdist, & prow=prow, rowi=rowi, & pcol=pcol, coli=coli, & vprow=vrow, vpcol=vcol, & myprow=row_dist(stored_row), myrowi=row_img, & mypcol=col_dist(stored_col), mycoli=col_img, & shifting=predist_type_fwd) dst_p = blacs2mpi(prow, pcol) ! These counts are meant for the thread that processes this row. myt_send_count(1, rowi, coli, dst_p) = & myt_send_count(1, rowi, coli, dst_p) + 1 myt_send_count(2, rowi, coli, dst_p) = & myt_send_count(2, rowi, coli, dst_p) + nze END DO ! symmetry_i END DO CALL dbcsr_iterator_stop(iter) DO dst_p = 0, numproc - 1 myt_total_send_count(1, dst_p) = SUM(myt_send_count(1, :, :, dst_p)) myt_total_send_count(2, dst_p) = SUM(myt_send_count(2, :, :, dst_p)) END DO ! Merge the send counts !$OMP CRITICAL send_count(:, :, :, :) = send_count(:, :, :, :) + myt_send_count(:, :, :, :) total_send_count(:, :) = total_send_count(:, :) + myt_total_send_count(:, :) !$OMP END CRITICAL !$OMP BARRIER !$OMP MASTER CALL timeset(routineN//"_sizes", handle2) CALL mp_alltoall(send_count, recv_count, 2*nrow_images*ncol_images, & mp_group) CALL timestop(handle2) !$OMP END MASTER !$OMP BARRIER ! Fill in the meta data structures and copy the data. !$OMP DO DO dst_p = 0, numproc - 1 total_recv_count(1, dst_p) = SUM(recv_count(1, :, :, dst_p)) total_recv_count(2, dst_p) = SUM(recv_count(2, :, :, dst_p)) END DO !$OMP MASTER ! Allocate data structures needed for data exchange. CALL dbcsr_data_init(recv_data_area) CALL dbcsr_data_init(send_data_area) IF (nrow_images .EQ. 1 .AND. ncol_images .EQ. 1 .OR. nocopy) THEN ! For some cases the faster dbcsr_special_finalize(reshuffle=.FALSE.) can be used. ! This basically makes this working matrix the actual data-area. ! Hence, for those cases we have to use data_memory_type already here. CALL dbcsr_data_new(recv_data_area, data_type, SUM(total_recv_count(2, :)), memory_type=memtype_abpanel_1) ! Some MPI implementations have high overhead when encountering a new buffer. ! Therefore we also use the memory pool for the send buffer. CALL dbcsr_data_new(send_data_area, data_type, SUM(total_send_count(2, :)), memory_type=memtype_abpanel_1) ELSE CALL dbcsr_data_new(recv_data_area, data_type, SUM(total_recv_count(2, :))) CALL dbcsr_data_new(send_data_area, data_type, SUM(total_send_count(2, :))) END IF ALLOCATE (recv_meta(metalen*SUM(total_recv_count(1, :)))) ALLOCATE (send_meta(metalen*SUM(total_send_count(1, :)))) ! Calculate displacements for processors needed for the exchanges. DO dst_p = 1, numproc - 1 sm_disp(dst_p) = sm_disp(dst_p - 1) & + metalen*total_send_count(1, dst_p - 1) sd_disp(dst_p) = sd_disp(dst_p - 1) & + total_send_count(2, dst_p - 1) rm_disp(dst_p) = rm_disp(dst_p - 1) & + metalen*total_recv_count(1, dst_p - 1) rd_disp(dst_p) = rd_disp(dst_p - 1) & + total_recv_count(2, dst_p - 1) END DO myt_smp(:) = sm_disp(:) myt_sdp(:) = sd_disp(:) IF (nthreads .GT. 1) THEN all_total_send_offset(1, :) = myt_smp(:) + metalen*myt_total_send_count(1, :) all_total_send_offset(2, :) = myt_sdp(:) + myt_total_send_count(2, :) END IF !$OMP END MASTER !$OMP BARRIER IF (ithread .GT. 
0) THEN !$OMP CRITICAL myt_smp(:) = all_total_send_offset(1, :) myt_sdp(:) = all_total_send_offset(2, :) all_total_send_offset(1, :) & = all_total_send_offset(1, :) + metalen*myt_total_send_count(1, :) all_total_send_offset(2, :) & = all_total_send_offset(2, :) + myt_total_send_count(2, :) !$OMP END CRITICAL ELSE CALL dbcsr_data_init(received_data_area) received_data_area = recv_data_area CALL dbcsr_data_hold(received_data_area) DO row_img = 1, nrow_images DO col_img = 1, ncol_images CALL dbcsr_work_create(ums%mats(row_img, col_img), & SUM(recv_count(1, row_img, col_img, :)), n=1) CALL dbcsr_data_hold(received_data_area) CALL dbcsr_data_release(ums%mats(row_img, col_img)%wms(1)%data_area) ums%mats(row_img, col_img)%wms(1)%data_area = received_data_area END DO END DO END IF !$OMP BARRIER ! Add timing call to the packing of the send buffers ! CALL timeset(routineN//"_pack", handle2) ! Copies metadata and actual data to be sent into the send buffers. CALL dbcsr_iterator_start(iter, ism, shared=.TRUE.) SELECT CASE (data_type) CASE (dbcsr_type_real_4) CALL prepare_buffers_s(sm%negate_real, sm%negate_imaginary, & iter, row, col, blk, blk_p, bp, & row_size, col_size, nze, nsymmetries, symmetry_i, & stored_row, stored_col, tr_row_size, tr_col_size, tr, & row_img, col_img, nrow_images, ncol_images, & row_img_dist, col_img_dist, predist_type_fwd, blacs2mpi, & target_imgdist, prow, pcol, rowi, coli, & row_dist, col_dist, dst_p, sm_pos, myt_smp, metalen, & sd_pos, myt_sdp, send_meta, sd_disp, & dbcsr_get_data_p_s(sm%data_area), & send_data_area, scale_neg_one, scale_value) CASE (dbcsr_type_real_8) CALL prepare_buffers_d(sm%negate_real, sm%negate_imaginary, & iter, row, col, blk, blk_p, bp, & row_size, col_size, nze, nsymmetries, symmetry_i, & stored_row, stored_col, tr_row_size, tr_col_size, tr, & row_img, col_img, nrow_images, ncol_images, & row_img_dist, col_img_dist, predist_type_fwd, blacs2mpi, & target_imgdist, prow, pcol, rowi, coli, & row_dist, col_dist, dst_p, sm_pos, myt_smp, metalen, & sd_pos, myt_sdp, send_meta, sd_disp, & dbcsr_get_data_p_d(sm%data_area), & send_data_area, scale_neg_one, scale_value) CASE (dbcsr_type_complex_4) CALL prepare_buffers_c(sm%negate_real, sm%negate_imaginary, & iter, row, col, blk, blk_p, bp, & row_size, col_size, nze, nsymmetries, symmetry_i, & stored_row, stored_col, tr_row_size, tr_col_size, tr, & row_img, col_img, nrow_images, ncol_images, & row_img_dist, col_img_dist, predist_type_fwd, blacs2mpi, & target_imgdist, prow, pcol, rowi, coli, & row_dist, col_dist, dst_p, sm_pos, myt_smp, metalen, & sd_pos, myt_sdp, send_meta, sd_disp, & dbcsr_get_data_p_c(sm%data_area), & send_data_area, scale_neg_one, scale_value) CASE (dbcsr_type_complex_8) CALL prepare_buffers_z(sm%negate_real, sm%negate_imaginary, & iter, row, col, blk, blk_p, bp, & row_size, col_size, nze, nsymmetries, symmetry_i, & stored_row, stored_col, tr_row_size, tr_col_size, tr, & row_img, col_img, nrow_images, ncol_images, & row_img_dist, col_img_dist, predist_type_fwd, blacs2mpi, & target_imgdist, prow, pcol, rowi, coli, & row_dist, col_dist, dst_p, sm_pos, myt_smp, metalen, & sd_pos, myt_sdp, send_meta, sd_disp, & dbcsr_get_data_p_z(sm%data_area), & send_data_area, scale_neg_one, scale_value) END SELECT CALL dbcsr_iterator_stop(iter) ! Deallocate thread private data ! DEALLOCATE (myt_send_count) DEALLOCATE (myt_total_send_count) DEALLOCATE (myt_smp, myt_sdp) CALL timestop(handle2) !$OMP END PARALLEL ! Exchange the data and metadata structures. 
In the interesting cases (square grids, row col distribution same), ! there are only very few processors that need to exchange data. ! The hybrid_alltoall deals with this by doing point to point communication CALL timeset(routineN//"_data", handle2) CALL hybrid_alltoall_any(send_data_area, total_send_count(2, :), sd_disp(:) - 1, & recv_data_area, total_recv_count(2, :), rd_disp(:) - 1, & mp_obj, & most_ptp=.TRUE., remainder_ptp=.TRUE., no_hybrid=.FALSE.) CALL hybrid_alltoall_i1( & send_meta(:), metalen*total_send_count(1, :), sm_disp(:) - 1, & recv_meta(:), metalen*total_recv_count(1, :), rm_disp(:) - 1, & most_ptp=.TRUE., remainder_ptp=.TRUE., no_hybrid=.FALSE., & mp_env=mp_obj) CALL timestop(handle2) ! Now create the work index and/or copy the relevant data from the ! receive buffer into the local indices. ! DO src_p = 0, numproc - 1 DO blk_l = 1, total_recv_count(1, src_p) stored_row = recv_meta(rm_disp(src_p) + metalen*(blk_l - 1)) stored_col = recv_meta(rm_disp(src_p) + metalen*(blk_l - 1) + 1) stored_blk_p = recv_meta(rm_disp(src_p) + metalen*(blk_l - 1) + 2) row_img = recv_meta(rm_disp(src_p) + metalen*(blk_l - 1) + 3) col_img = recv_meta(rm_disp(src_p) + metalen*(blk_l - 1) + 4) nze = row_blk_size(stored_row)*col_blk_size(stored_col) blk = blks(row_img, col_img) blks(row_img, col_img) = blks(row_img, col_img) + 1 blk_ps(row_img, col_img) = blk_ps(row_img, col_img) + nze ums%mats(row_img, col_img)%wms(1)%row_i(blk) = stored_row ums%mats(row_img, col_img)%wms(1)%col_i(blk) = stored_col ums%mats(row_img, col_img)%wms(1)%blk_p(blk) = & SIGN(rd_disp(src_p) + ABS(stored_blk_p) - 1, stored_blk_p) END DO END DO ! Finalize the actual imaged matrices from the work matrices ! DO row_img = 1, nrow_images DO col_img = 1, ncol_images ums%mats(row_img, col_img)%wms(1)%lastblk = blks(row_img, col_img) - 1 ums%mats(row_img, col_img)%wms(1)%datasize = blk_ps(row_img, col_img) - 1 ! CALL dbcsr_data_set_size_referenced( & ums%mats(row_img, col_img)%wms(1)%data_area, & ums%mats(row_img, col_img)%wms(1)%datasize) IF (nrow_images .EQ. 1 .AND. ncol_images .EQ. 1 .OR. nocopy) THEN CALL dbcsr_special_finalize(ums%mats(row_img, col_img), reshuffle=.FALSE.) ELSE CALL dbcsr_special_finalize(ums%mats(row_img, col_img), reshuffle=.TRUE.) END IF ! Save the home process and image row and column CALL image_calculator(target_imgdist, & ums%mats(row_img, col_img)%index(dbcsr_slot_home_prow), & ums%mats(row_img, col_img)%index(dbcsr_slot_home_rowi), & ums%mats(row_img, col_img)%index(dbcsr_slot_home_pcol), & ums%mats(row_img, col_img)%index(dbcsr_slot_home_coli), & vprow=ums%mats(row_img, col_img)%index(dbcsr_slot_home_vprow), & vpcol=ums%mats(row_img, col_img)%index(dbcsr_slot_home_vpcol), & myrowi=row_img, mycoli=col_img, & shifting=predist_type) END DO END DO ! Deallocate shared temporary buffers ! DEALLOCATE (send_count, recv_count) DEALLOCATE (total_send_count, total_recv_count) DEALLOCATE (sdp, smp, sd_disp, sm_disp) DEALLOCATE (rd_disp, rm_disp) DEALLOCATE (all_total_send_offset) DEALLOCATE (blk_ps, blks) DEALLOCATE (recv_meta, send_meta) CALL dbcsr_data_release(send_data_area) CALL dbcsr_data_release(received_data_area) CALL dbcsr_data_release(recv_data_area) CALL timestop(handle) END SUBROUTINE make_images SUBROUTINE dbcsr_make_images_dense(images, new_rdist, & row_map, col_map, join_cols, join_rows, new_template) !! Makes dense matrices for the image matrices. !! @note Used for making matrices dense/undense TYPE(dbcsr_2d_array_type), INTENT(INOUT) :: images !! 
current (undense) matrix images, output is the dense matrix images TYPE(dbcsr_imagedistribution_obj), INTENT(INOUT) :: new_rdist !! the new image distribution for dense matrices TYPE(array_i1d_obj), INTENT(IN) :: row_map, col_map !! mapping of current (undense) rows to dense rows !! mapping of current (undense) columns to dense columns LOGICAL, INTENT(IN) :: join_cols, join_rows !! make columns dense, default is yes !! make rows dense, default is yes TYPE(dbcsr_type), INTENT(IN) :: new_template !! template dense matrix for creating image matrices CHARACTER(len=*), PARAMETER :: routineN = 'dbcsr_make_images_dense' LOGICAL, PARAMETER :: dbg = .FALSE. INTEGER :: handle, mat_col, mat_row, mat_vpcol, & mat_vprow INTEGER, DIMENSION(:), CONTIGUOUS, POINTER :: und_col_blk_offsets, und_row_blk_offsets INTEGER, DIMENSION(dbcsr_meta_size) :: old_meta REAL(kind=dp) :: cs TYPE(array_i1d_obj) :: dense_local_vcols, dense_local_vrows, & und_local_vcols, und_local_vrows TYPE(dbcsr_imagedistribution_obj) :: old_rdist TYPE(dbcsr_type) :: tmp_mat ! --------------------------------------------------------------------------- CALL timeset(routineN, handle) old_rdist = images%image_dist ! DO mat_row = 1, images%image_dist%i%row_decimation DO mat_col = 1, images%image_dist%i%col_decimation IF (dbg) THEN cs = dbcsr_checksum(images%mats(mat_row, mat_col)) WRITE (*, *) routineN//" cs pre", cs END IF mat_vprow = images%mats(mat_row, mat_col)%index(dbcsr_slot_home_vprow) mat_vpcol = images%mats(mat_row, mat_col)%index(dbcsr_slot_home_vpcol) und_row_blk_offsets => array_data(images%mats(mat_row, mat_col)%row_blk_offset) und_col_blk_offsets => array_data(images%mats(mat_row, mat_col)%col_blk_offset) CALL dbcsr_get_local_vrows(old_rdist, und_local_vrows, mat_vprow) CALL dbcsr_get_local_vcols(old_rdist, und_local_vcols, mat_vpcol) CALL dbcsr_get_local_vrows(new_rdist, dense_local_vrows, mat_vprow) CALL dbcsr_get_local_vcols(new_rdist, dense_local_vcols, mat_vpcol) ! The old matrix has to be remembered so it is copied to ! tmp_mat. old_meta(:) = images%mats(mat_row, mat_col)%index(1:dbcsr_meta_size) tmp_mat = dbcsr_type() tmp_mat = images%mats(mat_row, mat_col) images%mats(mat_row, mat_col) = dbcsr_type() CALL dbcsr_create(images%mats(mat_row, mat_col), template=new_template) images%mats(mat_row, mat_col)%index(dbcsr_slot_home_prow & :dbcsr_slot_home_vpcol) = & old_meta(dbcsr_slot_home_prow:dbcsr_slot_home_vpcol) CALL dbcsr_make_dense_low(tmp_mat, images%mats(mat_row, mat_col), & array_data(und_local_vrows), array_data(und_local_vcols), & und_row_blk_offsets, und_col_blk_offsets, & array_data(dense_local_vrows), & array_data(dense_local_vcols), & array_data(new_template%row_blk_offset), & array_data(new_template%col_blk_offset), & array_data(row_map), array_data(col_map), join_rows, join_cols) ! CALL dbcsr_index_prune_deleted(images%mats(mat_row, mat_col)) ! CALL dbcsr_release(tmp_mat) IF (dbg) THEN cs = dbcsr_checksum(images%mats(mat_row, mat_col)) WRITE (*, *) routineN//" cs pst", cs END IF END DO END DO CALL dbcsr_image_dist_release(images%image_dist) images%image_dist = new_rdist CALL dbcsr_image_dist_hold(images%image_dist) CALL timestop(handle) END SUBROUTINE dbcsr_make_images_dense SUBROUTINE multiply_cannon(left_set, right_set, product_matrix, & retain_sparsity, & filter_eps, flop, keep_product_data) !! Multiplies two DBCSR matrices TYPE(dbcsr_2d_array_type), POINTER :: left_set, right_set !! set of imaged left matrices !! set of imaged right matrices TYPE(dbcsr_type), INTENT(INOUT) :: product_matrix !! 
DBCSR product matrix LOGICAL, INTENT(IN), OPTIONAL :: retain_sparsity !! retain the sparsity of the existing product matrix; default is no REAL(kind=real_8), INTENT(in), OPTIONAL :: filter_eps INTEGER(KIND=int_8), INTENT(OUT) :: flop !! effective flop LOGICAL, INTENT(IN) :: keep_product_data CHARACTER(len=*), PARAMETER :: routineN = 'multiply_cannon' INTEGER, PARAMETER :: idata = 1, ileft = 0, imeta = 2, & iright = 2 INTEGER :: data_type, data_type_byte, handle, handle1, handle2, handle3, i, ithread, & left_col_image, left_col_mult, left_col_nimages, left_dst_icol, left_dst_irow, & left_dst_p, left_dst_pcol, left_dst_prow, left_dst_vcol, left_dst_vrow, left_max_nblks, & left_max_nze, left_myfirstvcol, left_myfirstvrow, left_mypcol, left_myprow, left_npcols, & left_nprows, left_recv_icol, left_recv_irow, left_recv_p, left_recv_pcol, left_recv_prow, & left_recv_vcol, left_recv_vrow, left_row_image, left_row_mult, left_row_nimages, & left_send_icol, left_send_irow, left_send_p, left_send_pcol, left_send_prow INTEGER :: left_send_vcol, left_send_vrow, left_src_icol, left_src_irow, left_src_p, & left_src_pcol, left_src_prow, left_src_vcol, left_src_vrow, metronome, min_nimages, & mynode, nblkrows_used, nsteps_k, nthreads, numnodes, nvirt_k, & output_unit, right_col_image, right_col_mult, right_col_nimages, right_dst_icol, & right_dst_irow, right_dst_p, right_dst_pcol, right_dst_prow, right_dst_vcol, & right_dst_vrow, right_max_nblks, right_max_nze, right_myfirstvcol, right_myfirstvrow, & right_mypcol, right_myprow, right_npcols, right_nprows, right_recv_icol, right_recv_irow INTEGER :: right_recv_p, right_recv_pcol, right_recv_prow, right_recv_vcol, right_recv_vrow, & right_row_image, right_row_mult, right_row_nimages, right_send_icol, right_send_irow, & right_send_p, right_send_pcol, right_send_prow, right_send_vcol, right_send_vrow, & right_src_icol, right_src_irow, right_src_p, right_src_pcol, right_src_prow, & right_src_vcol, right_src_vrow, row, size_guess, size_guess_init, stat, threads_finished, & threads_finished_read, v_ki, v_ki_left, v_ki_right INTEGER(KIND=int_8) :: flop_single, flop_total, mem INTEGER, ALLOCATABLE, DIMENSION(:) :: row_counts, total_row_counts INTEGER, ALLOCATABLE, DIMENSION(:, :, :) :: left_sizes, my_sizes, right_sizes INTEGER, ALLOCATABLE, DIMENSION(:, :, :, :) :: all_sizes INTEGER, DIMENSION(:), POINTER, CONTIGUOUS :: col_blk_sizes2enum, enum2col_blk_sizes, & enum2row_blk_sizes, & left_index_rp, left_index_sp, m_sizes, n_sizes, & right_index_rp, right_index_sp, & row_blk_sizes2enum INTEGER, DIMENSION(:), POINTER, CONTIGUOUS :: k_sizes INTEGER, DIMENSION(:, :), POINTER, CONTIGUOUS :: left_pgrid, product_pgrid, right_pgrid INTEGER, SAVE :: mult_id = 0 LOGICAL :: keep_sparsity, list_indexing, & otf_filtering REAL(kind=sp), ALLOCATABLE, DIMENSION(:) :: left_norms, right_norms, & row_max_epss REAL(kind=sp) :: filter_eps_sp TYPE(dbcsr_2d_array_type), POINTER :: left_buffer_2, left_buffer_calc, & left_buffer_comm, right_buffer_2, right_buffer_calc, right_buffer_comm TYPE(dbcsr_data_obj) :: left_data_rp, left_data_sp, & right_data_rp, right_data_sp TYPE(dbcsr_data_obj), POINTER :: trs_stackbuf_calc, & trs_stackbuf_comm TYPE(dbcsr_data_obj), TARGET :: trs_stackbuf_1, trs_stackbuf_2 TYPE(dbcsr_mm_multrec_type_p), DIMENSION(:), ALLOCATABLE :: multrec TYPE(dbcsr_mp_obj) :: left_mp_obj, product_mp_obj, & right_mp_obj TYPE(mp_comm_type) :: grp, mp_group TYPE(mp_request_type), DIMENSION(:), ALLOCATABLE :: left_data_rr, left_data_sr, left_index_rr, & left_index_sr, right_data_rr, 
right_data_sr, right_index_rr, right_index_sr ! --------------------------------------------------------------------------- CALL timeset(routineN, handle) NULLIFY (trs_stackbuf_calc, trs_stackbuf_comm) NULLIFY (row_blk_sizes2enum, enum2row_blk_sizes) NULLIFY (col_blk_sizes2enum, enum2col_blk_sizes) NULLIFY (k_sizes) ! ALLOCATE (left_buffer_2, right_buffer_2) mult_id = mult_id + 1 IF (PRESENT(retain_sparsity)) THEN keep_sparsity = retain_sparsity ELSE keep_sparsity = .FALSE. END IF otf_filtering = PRESENT(filter_eps) !$OMP PARALLEL DEFAULT (NONE) & !$OMP SHARED (multrec, nthreads, product_matrix) !$OMP MASTER nthreads = 1 !$ nthreads = OMP_GET_NUM_THREADS() IF (.NOT. ASSOCIATED(product_matrix%wms)) & DBCSR_ABORT("Work matrices do not exist") IF (SIZE(product_matrix%wms) .NE. nthreads) & DBCSR_ABORT("Work matrices not correctly sized.") ALLOCATE (multrec(0:nthreads - 1)) !$OMP END MASTER !$OMP END PARALLEL output_unit = default_output_unit flop_total = 0 ! Set up variables data_type = dbcsr_get_data_type(product_matrix) data_type_byte = dbcsr_datatype_sizeof(data_type) left_row_nimages = left_set%image_dist%i%row_decimation left_row_mult = left_set%image_dist%i%row_multiplicity left_col_nimages = left_set%image_dist%i%col_decimation left_col_mult = left_set%image_dist%i%col_multiplicity right_row_nimages = right_set%image_dist%i%row_decimation right_row_mult = right_set%image_dist%i%row_multiplicity right_col_nimages = right_set%image_dist%i%col_decimation right_col_mult = right_set%image_dist%i%col_multiplicity left_mp_obj = dbcsr_distribution_mp(left_set%image_dist%i%main) right_mp_obj = dbcsr_distribution_mp(right_set%image_dist%i%main) product_mp_obj = dbcsr_distribution_mp(product_matrix%dist) numnodes = dbcsr_mp_numnodes(product_mp_obj) mynode = dbcsr_mp_mynode(product_mp_obj) left_nprows = dbcsr_mp_nprows(left_mp_obj) left_npcols = dbcsr_mp_npcols(left_mp_obj) left_myprow = dbcsr_mp_myprow(left_mp_obj) left_mypcol = dbcsr_mp_mypcol(left_mp_obj) left_myfirstvrow = dbcsr_mp_myprow(left_mp_obj)*left_row_nimages left_myfirstvcol = dbcsr_mp_mypcol(left_mp_obj)*left_col_nimages right_nprows = dbcsr_mp_nprows(right_mp_obj) right_npcols = dbcsr_mp_npcols(right_mp_obj) right_myprow = dbcsr_mp_myprow(right_mp_obj) right_mypcol = dbcsr_mp_mypcol(right_mp_obj) right_myfirstvrow = dbcsr_mp_myprow(right_mp_obj)*right_row_nimages right_myfirstvcol = dbcsr_mp_mypcol(right_mp_obj)*right_col_nimages mp_group = dbcsr_mp_group(product_mp_obj) left_pgrid => dbcsr_mp_pgrid(left_mp_obj) right_pgrid => dbcsr_mp_pgrid(right_mp_obj) product_pgrid => dbcsr_mp_pgrid(product_mp_obj) CALL dbcsr_mp_grid_setup(product_mp_obj) CALL dbcsr_mp_grid_setup(left_mp_obj) CALL dbcsr_mp_grid_setup(right_mp_obj) ! ! Dummy checks ! left/right matching IF (left_col_nimages .NE. right_row_mult) & DBCSR_ABORT("Left/Right image mismatch") IF (left_col_mult .NE. right_row_nimages) & DBCSR_ABORT("Left/Right image mismatch") IF (left_col_nimages*left_npcols .NE. right_row_nimages*right_nprows) & DBCSR_ABORT("Left/Right total mismatch") ! product/left matching IF (left_row_mult*dbcsr_mp_nprows(product_mp_obj) .NE. left_row_nimages*left_nprows) & DBCSR_ABORT("Product/Left total mismatch") ! product/left matching IF (right_col_mult*dbcsr_mp_npcols(product_mp_obj) .NE. right_col_nimages*right_npcols) & DBCSR_ABORT("Product/Right total mismatch") ! Limitations IF (left_row_nimages .NE. 1) & DBCSR_ABORT("Product/Left matrix process grid mismatch") IF (left_row_mult .NE. 
1) & DBCSR_ABORT("Product/Left matrix process grid mismatch") IF (right_col_nimages .NE. 1) & DBCSR_ABORT("Product/Right matrix process grid mismatch") IF (right_col_mult .NE. 1) & DBCSR_ABORT("Product/Right matrix process grid mismatch") dbcsr_mpi_statistics%nimages = MAX(dbcsr_mpi_statistics%nimages, left_row_nimages*left_col_nimages) dbcsr_mpi_statistics%nimages = MAX(dbcsr_mpi_statistics%nimages, right_row_nimages*right_col_nimages) ! ! Exchange size data ALLOCATE (my_sizes(4, MAX(left_row_nimages, right_row_nimages), & MAX(left_col_nimages, right_col_nimages))) my_sizes(:, :, :) = 0 DO left_row_image = 1, left_row_nimages DO left_col_image = 1, left_col_nimages my_sizes(idata + ileft, left_row_image, left_col_image) & = dbcsr_data_get_size_referenced( & left_set%mats(left_row_image, left_col_image)%data_area) my_sizes(imeta + ileft, left_row_image, left_col_image) = & left_set%mats(left_row_image, left_col_image)%index & (dbcsr_slot_size) END DO END DO DO right_row_image = 1, right_row_nimages DO right_col_image = 1, right_col_nimages my_sizes(idata + iright, right_row_image, right_col_image) & = dbcsr_data_get_size_referenced( & right_set%mats(right_row_image, right_col_image)%data_area) my_sizes(imeta + iright, right_row_image, right_col_image) = & right_set%mats(right_row_image, right_col_image)%index & (dbcsr_slot_size) END DO END DO ALLOCATE (all_sizes(4, LBOUND(my_sizes, 2):UBOUND(my_sizes, 2), & LBOUND(my_sizes, 3):UBOUND(my_sizes, 3), 0:numnodes - 1)) CALL mp_allgather(my_sizes, all_sizes, mp_group) ! ! Count the maximum possible multiplies per row for on-the-fly ! filtering. per_row_eps: IF (.NOT. otf_filtering) THEN ! These arrays must be valid when passed to called subroutines. ALLOCATE (left_norms(0), right_norms(0), row_max_epss(0), stat=stat) IF (stat .NE. 0) & DBCSR_ABORT("Could not allocate memory") ELSE IF (careful_mod) THEN IF (left_set%mats(1, 1)%bcsc) & DBCSR_ABORT("Can not do on-the-fly filtering with CSC-indexed matrices.") END IF IF (dbcsr_has_local_row_index(left_set%mats(1, 1))) THEN nblkrows_used = dbcsr_nblkrows_local(left_set%mats(1, 1)) ELSE nblkrows_used = dbcsr_nblkrows_total(left_set%mats(1, 1)) END IF ALLOCATE (row_max_epss(nblkrows_used), stat=stat) IF (stat .NE. 0) & DBCSR_ABORT("Could not allocate memory for left epsilons") ALLOCATE (row_counts(nblkrows_used), stat=stat) IF (stat .NE. 0) & DBCSR_ABORT("Could not allocate memory for left row counts") ! The summation could be done prow-locally but it would ! complicate the pre-row eps calculation. ALLOCATE (total_row_counts(nblkrows_used), stat=stat) IF (stat .NE. 0) & DBCSR_ABORT("Could not allocate memory for left row counts") ! Each prow member matrix (npcols * row_images) counts the ! blocks present in each of its rows. total_row_counts(:) = 0 DO left_row_image = 1, left_row_nimages DO left_col_image = 1, left_col_nimages list_indexing = & left_set%mats(left_row_image, left_col_image)%list_indexing IF (careful_mod) THEN IF (list_indexing) THEN IF ((left_set%mats(left_row_image, left_col_image)%nblks)*3 .NE. & SIZE(left_set%mats(left_row_image, left_col_image)%coo_l)) & DBCSR_ABORT("Row count mismatch") ELSE IF (nblkrows_used + 1 .NE. 
SIZE(left_set%mats(left_row_image, left_col_image)%row_p)) & DBCSR_ABORT("Row count mismatch") END IF END IF IF (list_indexing) THEN CALL count_bins( & left_set%mats(left_row_image, left_col_image)%nblks, & left_set%mats(left_row_image, left_col_image)%coo_l(1::3), & nblkrows_used, row_counts) ELSE CALL dbcsr_count_row_index( & left_set%mats(left_row_image, left_col_image)%row_p, & row_counts, nblkrows_used) END IF total_row_counts(:) = total_row_counts(:) & + row_counts(:) END DO END DO ! The counted blocks are then summed up CALL mp_sum(total_row_counts, dbcsr_mp_my_row_group(product_mp_obj)) ! and used to determine the maximum per-block epsilon. filter_eps_sp = REAL(filter_eps, KIND=KIND(row_max_epss)) !$OMP PARALLEL DO DEFAULT (NONE) & !$OMP SHARED(nblkrows_used,row_max_epss,filter_eps_sp,& !$OMP total_row_counts) DO row = 1, nblkrows_used row_max_epss(row) & = (filter_eps_sp & /REAL(MAX(1, total_row_counts(row)), KIND=KIND(row_max_epss)))**2 END DO !$OMP END PARALLEL DO ! DEALLOCATE (row_counts) DEALLOCATE (total_row_counts) END IF per_row_eps ! ! The main transfer loop goes through the virtual rows/columns. ! The number of steps may be smaller if the grid dimension is very ! non-optimal (both left column images and right row images are > ! 1). min_nimages = MIN(left_col_nimages, right_row_nimages) nvirt_k = left_npcols*left_col_nimages nsteps_k = nvirt_k/min_nimages ! ! Translate the all_sizes to account for pre-distribution. This ! is just done to simplify lookups. ALLOCATE (left_sizes(2, 0:left_nprows*left_row_nimages - 1, 0:nvirt_k - 1)) left_sizes = -1 DO left_src_vcol = 0, left_col_nimages*left_npcols - 1 DO left_src_vrow = 0, left_row_nimages*left_nprows - 1 ! Calculate what was shifted. The left_src_v{row,col} are ! the "source" rows/columns; the left_dst are the shifted ! targets where the data was placed in make_images. CALL image_calculator(left_set%image_dist, & prow=left_dst_prow, pcol=left_dst_pcol, & rowi=left_dst_irow, coli=left_dst_icol, & myvprow=left_src_vrow, myvpcol=left_src_vcol, & shifting='l') left_dst_p = left_pgrid(left_dst_prow, left_dst_pcol) left_sizes(idata, left_src_vrow, left_src_vcol) = & all_sizes( & idata + ileft, left_dst_irow, left_dst_icol, left_dst_p) left_sizes(imeta, left_src_vrow, left_src_vcol) = & all_sizes( & imeta + ileft, left_dst_irow, left_dst_icol, left_dst_p) END DO END DO ! ALLOCATE (right_sizes(2, 0:nvirt_k - 1, 0:right_npcols*right_col_nimages - 1)) right_sizes = -1 DO right_src_vcol = 0, right_col_nimages*right_npcols - 1 DO right_src_vrow = 0, right_row_nimages*right_nprows - 1 ! Calculate what was shifted. The right_src_v{row,col} are ! the "source" rows/columns; the right_dst are the shifted ! targets where the data was placed in make_images. CALL image_calculator(right_set%image_dist, & prow=right_dst_prow, pcol=right_dst_pcol, & rowi=right_dst_irow, coli=right_dst_icol, & myvprow=right_src_vrow, myvpcol=right_src_vcol, & shifting='r') right_dst_p = right_pgrid(right_dst_prow, right_dst_pcol) right_sizes(idata, right_src_vrow, right_src_vcol) = & all_sizes( & idata + iright, right_dst_irow, right_dst_icol, right_dst_p) right_sizes(imeta, right_src_vrow, right_src_vcol) = & all_sizes( & imeta + iright, right_dst_irow, right_dst_icol, right_dst_p) END DO END DO ! ! Setup product work areas left_max_nze = MAXVAL(all_sizes(idata + ileft, :, :, :)) left_max_nblks = MAXVAL(all_sizes(imeta + ileft, :, :, :)) right_max_nze = MAXVAL(all_sizes(idata + iright, :, :, :)) right_max_nblks = MAXVAL(all_sizes(imeta + iright, :, :, :)) !! ! 
Evaluate sizes for workspaces IF (.NOT. keep_sparsity) THEN IF (has_acc) THEN size_guess_init = product_matrix_size_guess(left_set%mats(1, 1), right_set%mats(1, 1), product_matrix, & left_max_nze, right_max_nze, & left_col_nimages, right_row_nimages, & nthreads) ELSE size_guess_init = 1 END IF END IF ithread = 0 !$OMP PARALLEL DEFAULT(NONE) & !$OMP PRIVATE (i, size_guess, ithread) & !$OMP SHARED (product_matrix, left_max_nze, right_max_nze) & !$OMP SHARED (left_set, right_set, & !$OMP left_col_nimages, right_row_nimages) & !$OMP SHARED (nthreads, keep_sparsity, mynode, size_guess_init) ! !$ ithread = OMP_GET_THREAD_NUM() ! The work arrays have to be setup (actually, not quite sure). i = ithread + 1 size_guess = product_matrix%wms(i)%datasize ! Should be minimal IF (.NOT. keep_sparsity) THEN size_guess = MAX(size_guess, size_guess_init) END IF CALL dbcsr_data_ensure_size(product_matrix%wms(i)%data_area, & size_guess) CALL dbcsr_data_set_size_referenced(product_matrix%wms(i)%data_area, & product_matrix%wms(i)%datasize) ! XXXXXXX a quick fix right now, allocation with size 1 might actually not be needed at all, ! but something expects this to be associated CALL ensure_array_size(product_matrix%wms(i)%row_i, ub=1) CALL ensure_array_size(product_matrix%wms(i)%col_i, ub=1) CALL ensure_array_size(product_matrix%wms(i)%blk_p, ub=1) !$OMP END PARALLEL ! update capacity of memory-pools, +1 for the dense case IF (ASSOCIATED(memtype_abpanel_1%pool)) & CALL dbcsr_mempool_limit_capacity(memtype_abpanel_1%pool, & capacity=left_row_mult*left_col_nimages + right_row_nimages*right_col_mult + 1) IF (ASSOCIATED(memtype_abpanel_2%pool)) & CALL dbcsr_mempool_limit_capacity(memtype_abpanel_2%pool, & capacity=left_row_mult*left_col_nimages + right_row_nimages*right_col_mult + 1) IF (has_acc) THEN ! enumerate the blocksizes to keep the following 2D-arrays small. CALL enumerate_blk_sizes(right_set%mats(1, 1)%row_blk_size%low%data, & dbcsr_max_row_size(right_set%mats(1, 1)), & row_blk_sizes2enum, enum2row_blk_sizes) CALL enumerate_blk_sizes(right_set%mats(1, 1)%col_blk_size%low%data, & dbcsr_max_col_size(right_set%mats(1, 1)), & col_blk_sizes2enum, enum2col_blk_sizes) END IF ! ! Setup the left buffer matrices ! CALL buffer_matrices_ensure_size(left_set, index_size=left_max_nblks, & data_size=left_max_nze) CALL setup_buffer_matrices(left_buffer_2, left_row_mult, left_col_nimages, & left_set%mats(1, 1), index_size=left_max_nblks, & data_size=left_max_nze) IF (otf_filtering) THEN ALLOCATE (left_norms(left_max_nblks), stat=stat) IF (stat .NE. 0) & DBCSR_ABORT("Could not allocate memory for left norms") IF (stat .NE. 0) otf_filtering = .FALSE. END IF left_buffer_calc => left_set left_buffer_comm => left_buffer_2 ALLOCATE (left_data_sr(left_col_nimages)) ALLOCATE (left_index_sr(left_col_nimages)) ALLOCATE (left_data_rr(left_col_nimages)) ALLOCATE (left_index_rr(left_col_nimages)) left_data_sr = mp_request_null left_data_rr = mp_request_null left_index_sr = mp_request_null left_index_rr = mp_request_null ! Setup buffers for right matrix CALL buffer_matrices_ensure_size(right_set, index_size=right_max_nblks, & data_size=right_max_nze) CALL setup_buffer_matrices(right_buffer_2, right_row_nimages, right_col_mult, & right_set%mats(1, 1), index_size=right_max_nblks, data_size=right_max_nze) IF (otf_filtering) THEN ALLOCATE (right_norms(right_max_nblks), stat=stat) IF (stat .NE. 0) & DBCSR_WARN("Could not allocate memory for right norms") IF (stat .NE. 0) otf_filtering = .FALSE. 
END IF right_buffer_calc => right_set right_buffer_comm => right_buffer_2 ALLOCATE (right_data_sr(right_row_nimages)) ALLOCATE (right_index_sr(right_row_nimages)) ALLOCATE (right_data_rr(right_row_nimages)) ALLOCATE (right_index_rr(right_row_nimages)) right_data_sr = mp_request_null right_data_rr = mp_request_null right_index_sr = mp_request_null right_index_rr = mp_request_null ! ALLOCATE (m_sizes(dbcsr_nblkrows_local(product_matrix))) CALL local_filter(array_data(product_matrix%row_blk_size), array_size(product_matrix%local_rows), & array_data(product_matrix%local_rows), m_sizes) ALLOCATE (n_sizes(dbcsr_nblkcols_local(product_matrix))) CALL local_filter(array_data(product_matrix%col_blk_size), array_size(product_matrix%local_cols), & array_data(product_matrix%local_cols), n_sizes) ! !$OMP PARALLEL & !$OMP DEFAULT (NONE) & !$OMP SHARED (left_buffer_comm, right_buffer_comm, product_matrix,& !$OMP keep_sparsity, filter_eps, row_max_epss, multrec, nthreads, & !$OMP right_data_sr, right_data_rr, left_data_sr, left_data_rr,& !$OMP right_index_sr, right_index_rr, left_index_sr, left_index_rr,& !$OMP m_sizes, n_sizes, keep_product_data), & !$OMP PRIVATE(ithread) ithread = 0 !$ ithread = OMP_GET_THREAD_NUM() ALLOCATE (multrec(ithread)%p) CALL dbcsr_mm_multrec_init(multrec(ithread)%p, & product=product_matrix, & keep_sparsity=keep_sparsity, & eps=filter_eps, & row_max_epss=row_max_epss, & block_estimate=MAX(product_matrix%nblks, & left_buffer_comm%mats(1, 1)%nblks, & right_buffer_comm%mats(1, 1)%nblks)/nthreads, & right_row_blk_size=array_data(right_buffer_comm%mats(1, 1)%row_blk_size), & m_sizes=m_sizes, n_sizes=n_sizes, & keep_product_data=keep_product_data) !$OMP END PARALLEL ! ! Setup indexing CALL setup_rec_index_2d(left_set, left_row_nimages, left_col_nimages) CALL setup_rec_index_2d(right_set, right_row_nimages, right_col_nimages) ! ! Setup the send/receive data pointers CALL dbcsr_data_init(left_data_sp) CALL dbcsr_data_init(left_data_rp) CALL dbcsr_data_init(right_data_sp) CALL dbcsr_data_init(right_data_rp) CALL dbcsr_data_new(left_data_sp, data_type) CALL dbcsr_data_new(left_data_rp, data_type) CALL dbcsr_data_new(right_data_sp, data_type) CALL dbcsr_data_new(right_data_rp, data_type) ! Setup transpose stackbuffers IF (has_acc) THEN CALL dbcsr_data_init(trs_stackbuf_1) CALL dbcsr_data_init(trs_stackbuf_2) CALL dbcsr_data_new(trs_stackbuf_1, data_type=dbcsr_type_int_4, & data_size=2*right_max_nblks, memory_type=memtype_trsbuffer_1) CALL dbcsr_data_new(trs_stackbuf_2, data_type=dbcsr_type_int_4, & data_size=2*right_max_nblks, memory_type=memtype_trsbuffer_2) trs_stackbuf_calc => trs_stackbuf_1 trs_stackbuf_comm => trs_stackbuf_2 END IF ! ! Reset indices for virtual images v_ki_right = 0 v_ki_left = 0 ! ! Here is the main loop. ! ! In the first loop iteration, the data is fetched from the ! sources. In the remaining iterations, the data are exchanged ! among neighbors. In the last loop only calculations take place. ! CALL timeset(routineN//"_loop", handle1) ! grouped_k_index: DO metronome = 0, nvirt_k - 1 ! Wait for right matrix transfer completion. Wait in all but ! the first loop iteration. CALL timeset(routineN//"_metrocomm1", handle2) wait_right: IF (v_ki_right .EQ. right_row_nimages) THEN ! Reset index v_ki_right = 0 IF (debug_mod) WRITE (*, '(1X,A)') routineN//" waiting for right" ! CALL mp_waitall(right_data_sr) CALL mp_waitall(right_data_rr) CALL mp_waitall(right_index_sr) CALL mp_waitall(right_index_rr) ! ! 
Repoint indices of right matrices DO v_ki = 0, right_row_nimages - 1 CALL dbcsr_repoint_index(right_buffer_calc%mats(v_ki + 1, 1)) right_buffer_calc%mats(v_ki + 1, 1)%valid = .TRUE. END DO END IF wait_right CALL timestop(handle2) ! ! Right matrix transfer. Transfer in all but the last loop ! iteration. xfer_right: IF (v_ki_right .EQ. 0 .AND. metronome + right_row_nimages .LT. nvirt_k) THEN DO v_ki = 0, right_row_nimages - 1 ! Calculate the process to send to. It's the virtual ! process row -min_nimages up (i.e., smaller row number) ! from me. CALL image_calculator(right_set%image_dist, & prow=right_send_prow, rowi=right_send_irow, & ! output pcol=right_send_pcol, coli=right_send_icol, & ! output vprow=right_send_vrow, vpcol=right_send_vcol, & ! output ! myvprow goes through all of my (process row) images myvprow=v_ki + right_myfirstvrow, & myvpcol=right_myfirstvcol, & ! nothing happens in the columns vprow_shift=-right_row_nimages, & shifting='0') ! Calculate which data I send. CALL image_calculator(right_set%image_dist, & prow=right_dst_prow, rowi=right_dst_irow, & pcol=right_dst_pcol, coli=right_dst_icol, & vprow=right_dst_vrow, vpcol=right_dst_vcol, & ! myvprows goes through all of my (process row) images myvprow=v_ki + right_myfirstvrow, & myvpcol=right_myfirstvcol, & ! nothing happens in the columns vprow_shift=metronome, & ! This is with relative shifting. shifting='R') right_dst_p = right_pgrid(right_dst_prow, right_dst_pcol) CALL dbcsr_data_set_pointer( & area=right_data_sp, & rsize=right_sizes(idata, right_dst_vrow, right_dst_vcol), & csize=1, & pointee=right_buffer_calc%mats(v_ki + 1, 1)%data_area) right_index_sp => right_buffer_calc%mats( & v_ki + 1, 1 & )%index(1: & right_sizes(imeta, right_dst_vrow, right_dst_vcol)) ! ! Calculate the process to receive from CALL image_calculator(right_set%image_dist, & prow=right_recv_prow, rowi=right_recv_irow, & pcol=right_recv_pcol, coli=right_recv_icol, & vprow=right_recv_vrow, vpcol=right_recv_vcol, & myvprow=v_ki + right_myfirstvrow, & myvpcol=right_myfirstvcol, & vprow_shift=+right_row_nimages, & ! just the opposite as "send to" shifting='0') ! Calculate which data I receive CALL image_calculator(right_set%image_dist, & prow=right_src_prow, rowi=right_src_irow, & pcol=right_src_pcol, coli=right_src_icol, & vprow=right_src_vrow, vpcol=right_src_vcol, & myvprow=v_ki + right_myfirstvrow, & myvpcol=right_myfirstvcol, & ! receive window moves with the metronome vprow_shift=metronome + right_row_nimages, & shifting='R') ! IF (has_acc) THEN CALL timeset(routineN//"_acc_sync_right", handle3) CALL acc_event_synchronize(right_buffer_comm%mats(v_ki + 1, 1)%data_area%d%acc_ready) CALL timestop(handle3) END IF right_src_p = right_pgrid(right_src_prow, right_src_pcol) CALL dbcsr_data_set_pointer( & area=right_data_rp, & rsize=right_sizes(idata, right_src_vrow, right_src_vcol), & csize=1, & pointee=right_buffer_comm%mats(v_ki + 1, 1)%data_area) right_index_rp => right_buffer_comm%mats( & v_ki + 1, 1 & )%index(1: & right_sizes(imeta, right_src_vrow, right_src_vcol)) ! right_send_p = right_pgrid(right_send_prow, right_send_pcol) right_recv_p = right_pgrid(right_recv_prow, right_recv_pcol) ! These are column-communicator relative IF (dbcsr_mp_has_subgroups(right_mp_obj)) THEN right_send_p = right_send_prow right_recv_p = right_recv_prow grp = dbcsr_mp_my_col_group(right_mp_obj) ELSE grp = dbcsr_mp_group(right_mp_obj) END IF ! 
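               ! Post the non-blocking exchange of this right-matrix panel: data
               ! and index are sent with dbcsr_isend_any/mp_isend and received
               ! into the communication buffer with dbcsr_irecv_any/mp_irecv,
               ! tagged with the virtual source/destination process row.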
CALL timeset(routineN//"_metrocomm2", handle2) CALL dbcsr_irecv_any(right_data_rp, right_recv_p, & grp, right_data_rr(v_ki + 1), tag=right_src_vrow) CALL mp_irecv(right_index_rp, right_recv_p, & grp, right_index_rr(v_ki + 1), tag=right_src_vrow) CALL dbcsr_isend_any(right_data_sp, right_send_p, & grp, right_data_sr(v_ki + 1), tag=right_dst_vrow) CALL mp_isend(right_index_sp, right_send_p, & grp, right_index_sr(v_ki + 1), tag=right_dst_vrow) dbcsr_mpi_statistics%nexchanged = dbcsr_mpi_statistics%nexchanged + 1 CALL count_mpi_statistics(dbcsr_mpi_statistics%data_size(1, :), & dbcsr_data_get_size(right_data_rp), & data_type_byte, & dbcsr_mpi_statistics%data_size_breakdown(:, :, 1)) CALL timestop(handle2) END DO END IF xfer_right ! ! Wait for left matrix transfer completion. Wait in all but ! the first loop iteration. CALL timeset(routineN//"_metrocomm3", handle2) wait_left: IF (v_ki_left .EQ. left_col_nimages) THEN ! Reset index v_ki_left = 0 IF (debug_mod) WRITE (*, '(1X,A)') routineN//" waiting for left" CALL mp_waitall(left_data_sr) CALL mp_waitall(left_data_rr) CALL mp_waitall(left_index_sr) CALL mp_waitall(left_index_rr) ! ! Repoint indices of left matrices DO v_ki = 0, left_col_nimages - 1 CALL dbcsr_repoint_index(left_buffer_calc%mats(1, v_ki + 1)) left_buffer_calc%mats(1, v_ki + 1)%valid = .TRUE. END DO END IF wait_left CALL timestop(handle2) ! ! Left matrix transfer. Transfer in all but the last processor images. xfer_left: IF (v_ki_left .EQ. 0 .AND. metronome + left_col_nimages .LT. nvirt_k) THEN DO v_ki = 0, left_col_nimages - 1 ! Calculate the process to send to. CALL image_calculator(left_set%image_dist, & prow=left_send_prow, rowi=left_send_irow, & ! output pcol=left_send_pcol, coli=left_send_icol, & ! output vprow=left_send_vrow, vpcol=left_send_vcol, & ! output myvprow=left_myfirstvrow, & ! nothing happens in the rows ! go through all my column images myvpcol=v_ki + left_myfirstvcol, & ! send to process left_col_nimages left in the grid vpcol_shift=-left_col_nimages, & shifting='0') ! Calculate which data I send. CALL image_calculator(left_set%image_dist, & prow=left_dst_prow, rowi=left_dst_irow, & pcol=left_dst_pcol, coli=left_dst_icol, & vprow=left_dst_vrow, vpcol=left_dst_vcol, & myvprow=left_myfirstvrow, & ! go through all my column images myvpcol=v_ki + left_myfirstvcol, & vpcol_shift=metronome, & ! This is with relative shifting. shifting='L') ! left_dst_p = left_pgrid(left_dst_prow, left_dst_pcol) CALL dbcsr_data_set_pointer( & area=left_data_sp, & rsize=left_sizes(idata, left_dst_vrow, left_dst_vcol), & csize=1, & pointee=left_buffer_calc%mats(1, v_ki + 1)%data_area) left_index_sp => left_buffer_calc%mats( & 1, v_ki + 1 & )%index(1: & left_sizes(imeta, left_dst_vrow, left_dst_vcol)) ! ! Calculate the process to receive from CALL image_calculator(left_set%image_dist, & prow=left_recv_prow, rowi=left_recv_irow, & pcol=left_recv_pcol, coli=left_recv_icol, & vprow=left_recv_vrow, vpcol=left_recv_vcol, & myvprow=left_myfirstvrow, & myvpcol=v_ki + left_myfirstvcol, & vpcol_shift=+left_col_nimages, & ! just the opposite as "send to" shifting='0') ! Calculate which data I receive CALL image_calculator(left_set%image_dist, & prow=left_src_prow, rowi=left_src_irow, & pcol=left_src_pcol, coli=left_src_icol, & vprow=left_src_vrow, vpcol=left_src_vcol, & myvprow=left_myfirstvrow, & myvpcol=v_ki + left_myfirstvcol, & ! receive window moves with the metronome vpcol_shift=metronome + left_col_nimages, & shifting='L') ! 
               IF (has_acc) THEN
                  CALL timeset(routineN//"_acc_sync_left", handle3)
                  CALL acc_event_synchronize(left_buffer_comm%mats(1, v_ki + 1)%data_area%d%acc_ready)
                  CALL timestop(handle3)
               END IF
               left_src_p = left_pgrid(left_src_prow, left_src_pcol)
               CALL dbcsr_data_set_pointer( &
                  area=left_data_rp, &
                  rsize=left_sizes(idata, left_src_vrow, left_src_vcol), &
                  csize=1, &
                  pointee=left_buffer_comm%mats(1, v_ki + 1)%data_area)
               left_index_rp => left_buffer_comm%mats( &
                                1, v_ki + 1 &
                                )%index(1: &
                                        left_sizes(imeta, left_src_vrow, left_src_vcol))
               !
               left_send_p = left_pgrid(left_send_prow, left_send_pcol)
               left_recv_p = left_pgrid(left_recv_prow, left_recv_pcol)
               ! These are row-communicator relative
               IF (dbcsr_mp_has_subgroups(left_mp_obj)) THEN
                  left_send_p = left_send_pcol
                  left_recv_p = left_recv_pcol
                  grp = dbcsr_mp_my_row_group(left_mp_obj)
               ELSE
                  grp = dbcsr_mp_group(left_mp_obj)
               END IF
               !
               CALL timeset(routineN//"_metrocomm4", handle2)
               CALL dbcsr_irecv_any(left_data_rp, left_recv_p, &
                                    grp, left_data_rr(v_ki + 1), tag=left_src_vcol)
               CALL mp_irecv(left_index_rp, left_recv_p, &
                             grp, left_index_rr(v_ki + 1), tag=left_src_vcol)
               CALL dbcsr_isend_any(left_data_sp, left_send_p, &
                                    grp, left_data_sr(v_ki + 1), tag=left_dst_vcol)
               CALL mp_isend(left_index_sp, left_send_p, &
                             grp, left_index_sr(v_ki + 1), tag=left_dst_vcol)
               dbcsr_mpi_statistics%nexchanged = dbcsr_mpi_statistics%nexchanged + 1
               CALL count_mpi_statistics(dbcsr_mpi_statistics%data_size(2, :), &
                                         dbcsr_data_get_size(left_data_rp), &
                                         data_type_byte, &
                                         dbcsr_mpi_statistics%data_size_breakdown(:, :, 2))
               CALL timestop(handle2)
            END DO
         END IF xfer_left
         !
         ! Do multiplication
         v_ki_left = v_ki_left + 1
         v_ki_right = v_ki_right + 1
         IF (debug_mod) THEN
            CALL dbcsr_print(left_buffer_calc%mats(1, v_ki_left), nodata=.TRUE.)
            CALL dbcsr_print(right_buffer_calc%mats(v_ki_right, 1), nodata=.TRUE.)
         END IF
         !
         ! From here on, the code was taken from dbcsr_mm_driver_inner_init.
         !
         IF (.FALSE.) WRITE (*, *) routineN//" TICK", metronome
         ! Since the right matrix is shifted vertically, the
         ! received data always has different notions of "local
         ! rows". Thus the local_rows and global_rows must be
         ! recalculated.
         CALL dbcsr_reset_vlocals(right_buffer_calc%mats(v_ki_right, 1), &
                                  right_set%image_dist)
         CALL dbcsr_reset_vlocals(left_buffer_calc%mats(1, v_ki_left), &
                                  left_set%image_dist)
         !
         CALL ensure_array_size(k_sizes, ub=array_size(right_buffer_calc%mats(v_ki_right, 1)%local_rows))
         CALL local_filter(array_data(right_buffer_calc%mats(v_ki_right, 1)%row_blk_size), &
                           array_size(right_buffer_calc%mats(v_ki_right, 1)%local_rows), &
                           array_data(right_buffer_calc%mats(v_ki_right, 1)%local_rows), &
                           k_sizes)
         !
         IF (has_acc) THEN
            CALL dbcsr_data_host2dev(left_buffer_calc%mats(1, v_ki_left)%data_area)
            CALL dbcsr_data_host2dev(right_buffer_calc%mats(v_ki_right, 1)%data_area)
            CALL acc_transpose_blocks(right_buffer_calc%mats(v_ki_right, 1), trs_stackbuf_calc, &
                                      k_sizes, n_sizes, &
                                      row_blk_sizes2enum, enum2row_blk_sizes, &
                                      col_blk_sizes2enum, enum2col_blk_sizes)
         END IF
         ! Sets the local right-matrix columns
         IF (otf_filtering) THEN
            left_norms(:) = huge_norm
            right_norms(:) = huge_norm
            CALL calculate_norms(right_buffer_calc%mats(v_ki_right, 1), &
                                 right_norms, k_sizes, n_sizes)
            CALL calculate_norms(left_buffer_calc%mats(1, v_ki_left), &
                                 left_norms, m_sizes, k_sizes)
         END IF
         ! Wait for the left and right buffers' transfer to the device before proceeding
         IF (has_acc) THEN
            CALL timeset(routineN//"_sync_h2d", handle2)
            CALL acc_device_synchronize()
            CALL timestop(handle2)
         END IF
         !
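         ! Per-tick overlap (sketch of the pattern implemented above and below):
         !   1. post non-blocking transfers for the next tick (receives target the
         !      *_buffer_comm matrices, sends read from the current calc buffers),
         !   2. run the recursive multiplication on the *_buffer_calc matrices,
         !      while thread 0 may drive MPI progress via mp_testany,
         !   3. wait for the outstanding requests and switch the calc/comm
         !      buffer sets before the next tick consumes the received panels.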
         flop_single = 0
         threads_finished = 0
!$OMP PARALLEL DEFAULT (NONE) &
!$OMP SHARED (left_buffer_calc, right_buffer_calc, &
!$OMP         v_ki_left, v_ki_right, handle2, handle3, &
!$OMP         product_matrix, multrec, &
!$OMP         filter_eps, right_norms, left_norms, row_max_epss, &
!$OMP         keep_sparsity, threads_finished, &
!$OMP         right_data_sr, right_data_rr, right_index_sr, right_index_rr, &
!$OMP         left_data_sr, left_data_rr, left_index_sr, left_index_rr, &
!$OMP         dbcsr_cfg, k_sizes, nvirt_k, metronome) &
!$OMP PRIVATE (ithread, nthreads, threads_finished_read) &
!$OMP REDUCTION (+: flop_single)
         ithread = 0; nthreads = 1
!$       ithread = omp_get_thread_num(); nthreads = omp_get_num_threads()
         CALL timeset(routineN//"_multrec", handle2)
         CALL dbcsr_mm_multrec_multiply(multrec(ithread)%p, &
                                        left=left_buffer_calc%mats(1, v_ki_left), &
                                        right=right_buffer_calc%mats(v_ki_right, 1), &
                                        flop=flop_single, &
                                        a_norms=left_norms, b_norms=right_norms, &
                                        k_sizes=k_sizes)
         IF (metronome == nvirt_k - 1) THEN
            CALL timeset(routineN//"_multrec_finalize", handle3)
            CALL dbcsr_mm_multrec_finalize(multrec(ithread)%p)
            DEALLOCATE (multrec(ithread)%p)
            CALL timestop(handle3)
         END IF
!$OMP ATOMIC
         threads_finished = threads_finished + 1
         IF (dbcsr_cfg%use_comm_thread%val .AND. (ithread .EQ. 0)) THEN
            DO
               ! Requires OMP 3.1 (e.g. gcc >= 4.7) for correctness; otherwise we keep our fingers crossed.
#if defined _OPENMP && _OPENMP >= 200711
!$OMP ATOMIC READ
#endif
               threads_finished_read = threads_finished
               IF (threads_finished_read .EQ. nthreads) EXIT
               CALL mp_testany(right_data_sr)
               CALL mp_testany(right_data_rr)
               CALL mp_testany(left_data_sr)
               CALL mp_testany(left_data_rr)
               CALL mp_testany(right_index_sr)
               CALL mp_testany(right_index_rr)
               CALL mp_testany(left_index_sr)
               CALL mp_testany(left_index_rr)
            END DO
         END IF
!$OMP BARRIER
         CALL timestop(handle2)
!$OMP END PARALLEL
         flop_total = flop_total + flop_single
         !
         ! Move to the next images
         IF (v_ki_left .EQ. left_col_nimages) THEN
            CALL dbcsr_switch(left_buffer_calc, left_buffer_comm)
         END IF
         IF (v_ki_right .EQ. right_row_nimages) THEN
            CALL dbcsr_switch(right_buffer_calc, right_buffer_comm)
            CALL dbcsr_switch(trs_stackbuf_calc, trs_stackbuf_comm)
         END IF
      END DO grouped_k_index
      CALL timestop(handle1)
      CALL m_memory(mem)
      max_memory = MAX(max_memory, REAL(mem))
      IF (has_acc) THEN
         CALL dbcsr_data_release(trs_stackbuf_1)
         CALL dbcsr_data_release(trs_stackbuf_2)
         DEALLOCATE (row_blk_sizes2enum, enum2row_blk_sizes)
         DEALLOCATE (col_blk_sizes2enum, enum2col_blk_sizes)
      END IF
      IF (ALLOCATED(right_norms)) THEN
         DEALLOCATE (right_norms)
      END IF
      IF (ALLOCATED(left_norms)) THEN
         DEALLOCATE (left_norms)
      END IF
      IF (ALLOCATED(row_max_epss)) THEN
         DEALLOCATE (row_max_epss)
      END IF
      !
      CALL dbcsr_destroy_array(right_buffer_2)
      CALL dbcsr_destroy_array(left_buffer_2)
      DEALLOCATE (my_sizes)
      !
      CALL dbcsr_data_clear_pointer(left_data_sp)
      CALL dbcsr_data_clear_pointer(left_data_rp)
      CALL dbcsr_data_clear_pointer(right_data_sp)
      CALL dbcsr_data_clear_pointer(right_data_rp)
      CALL dbcsr_data_release(left_data_sp)
      CALL dbcsr_data_release(left_data_rp)
      CALL dbcsr_data_release(right_data_sp)
      CALL dbcsr_data_release(right_data_rp)
      !
      DEALLOCATE (left_data_rr, left_data_sr, left_index_rr, left_index_sr, &
                  right_data_rr, right_data_sr, right_index_rr, right_index_sr)
      !
      !
      IF (debug_mod) THEN
         v_ki = 0
         DO i = 1, SIZE(product_matrix%blk_p)
            v_ki = MAX(v_ki, ABS(product_matrix%blk_p(i)))
         END DO
         WRITE (*, *) routineN//" Actual final size", &
            LOG(REAL(dbcsr_data_get_size(product_matrix%data_area)))/LOG(10.0), &
            LOG(REAL(v_ki))/LOG(10.0)
      END IF
      !
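      ! The debug output above compares, on a log10 scale, the allocated size of
      ! the product data area with the largest block offset actually referenced,
      ! i.e. roughly how much of the allocation ended up being used.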
      flop = flop_total
      DEALLOCATE (left_buffer_2, right_buffer_2)
      DEALLOCATE (m_sizes, n_sizes)
      IF (ASSOCIATED(k_sizes)) DEALLOCATE (k_sizes)
      !
      CALL timestop(handle)
   END SUBROUTINE multiply_cannon

   SUBROUTINE setup_buffer_matrices(buffer_set, buff_nrows, buff_ncols, &
                                    source_matrix, index_size, data_size)
      TYPE(dbcsr_2d_array_type), INTENT(OUT)             :: buffer_set
      INTEGER, INTENT(IN)                                :: buff_nrows, buff_ncols
      TYPE(dbcsr_type), INTENT(IN)                       :: source_matrix
      INTEGER, INTENT(IN)                                :: index_size
      INTEGER, INTENT(IN), OPTIONAL                      :: data_size

      CHARACTER(len=*), PARAMETER :: routineN = 'setup_buffer_matrices'

      INTEGER                                            :: col_image, handle, row_image

!   ---------------------------------------------------------------------------

      CALL timeset(routineN, handle)
      CALL dbcsr_image_dist_init(buffer_set%image_dist)
      ALLOCATE (buffer_set%mats(buff_nrows, buff_ncols))
      DO row_image = 1, buff_nrows
         DO col_image = 1, buff_ncols
            CALL setup_buffer_matrix(buffer_set%mats(row_image, col_image), &
                                     source_matrix, index_size, data_size=data_size, &
                                     data_memory_type=memtype_abpanel_2)
         END DO
      END DO
      IF (source_matrix%local_indexing .AND. careful_mod) THEN
         IF (.NOT. array_exists(source_matrix%local_rows)) &
            DBCSR_ABORT("Local rows should exist.")
         IF (.NOT. array_exists(source_matrix%global_rows)) &
            DBCSR_ABORT("Global rows should exist.")
         !
         IF (.NOT. array_exists(source_matrix%local_cols)) &
            DBCSR_ABORT("Local cols should exist.")
         IF (.NOT. array_exists(source_matrix%global_cols)) &
            DBCSR_ABORT("Global cols should exist.")
      END IF
      CALL timestop(handle)
   END SUBROUTINE setup_buffer_matrices

   SUBROUTINE buffer_matrices_ensure_size(buffer_set, index_size, data_size)
      !! Enlarge left_set and right_set to hold any a/b-panel.
      !! left_set and right_set are created by make_images to hold the a/b-panels
      !! used for the initial Cannon tick. This routine ensures that these buffers
      !! can hold any a/b-panel occurring during a matrix multiply and therefore
      !! makes them suitable as buffers for the entire Cannon algorithm.
      !! This saves memory since no separate buffers for the first Cannon tick
      !! have to be allocated.
      TYPE(dbcsr_2d_array_type), INTENT(INOUT)           :: buffer_set
      INTEGER, INTENT(IN)                                :: index_size, data_size

      CHARACTER(len=*), PARAMETER :: routineN = 'buffer_matrices_ensure_size'

      INTEGER                                            :: col_image, handle, row_image

!   ---------------------------------------------------------------------------

      CALL timeset(routineN, handle)
      DO row_image = 1, SIZE(buffer_set%mats, 1)
         DO col_image = 1, SIZE(buffer_set%mats, 2)
            CALL dbcsr_data_ensure_size(buffer_set%mats(row_image, col_image)%data_area, &
                                        data_size)
            CALL ensure_array_size(buffer_set%mats(row_image, col_image)%index, &
                                   ub=index_size, &
                                   memory_type=dbcsr_get_index_memory_type(buffer_set%mats(row_image, col_image)))
            CALL dbcsr_repoint_index(buffer_set%mats(row_image, col_image))
         END DO
      END DO
      CALL timestop(handle)
   END SUBROUTINE buffer_matrices_ensure_size
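   ! Typical use of the routine above (sketch; max_index_size and max_data_size
   ! are hypothetical names for the maxima over all panels determined by the caller):
   !    CALL buffer_matrices_ensure_size(left_set, max_index_size, max_data_size)
   !    CALL buffer_matrices_ensure_size(right_set, max_index_size, max_data_size)
   ! After this, every image matrix in the set can receive any panel during the
   ! Cannon ticks without further reallocation.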
   SUBROUTINE setup_rec_index_2d(matrix_set, n_rows, n_cols)
      TYPE(dbcsr_2d_array_type), INTENT(INOUT)           :: matrix_set
      INTEGER, INTENT(IN)                                :: n_rows, n_cols

      CHARACTER(len=*), PARAMETER :: routineN = 'setup_rec_index_2d'
      LOGICAL, PARAMETER                                 :: dbg = .FALSE.

      INTEGER                                            :: handle, i_col, i_row, t_f, t_l, t_size
!$    INTEGER                                            :: ithread
      LOGICAL                                            :: thread_redist

!   ---------------------------------------------------------------------------

      CALL timeset(routineN, handle)
      DO i_row = 1, n_rows
         DO i_col = 1, n_cols
            IF (.FALSE.) &
               CALL dbcsr_reset_vlocals(matrix_set%mats(i_row, i_col), &
                                        matrix_set%image_dist)
            IF (dbg) THEN
               WRITE (*, *) routineN//" m, n, size", &
                  SIZE(matrix_set%mats(i_row, i_col)%coo_l), &
                  dbcsr_nblkrows_local(matrix_set%mats(i_row, i_col)), &
                  dbcsr_nblkcols_local(matrix_set%mats(i_row, i_col))
               WRITE (*, '(3(1X,I7))') matrix_set%mats(i_row, i_col)%coo_l
            END IF
            IF (careful_mod) THEN
               IF (SIZE(matrix_set%mats(i_row, i_col)%coo_l) .NE. matrix_set%mats(i_row, i_col)%nblks*3) &
                  DBCSR_ABORT("Block count mismatch.")
            END IF
            thread_redist = ASSOCIATED(matrix_set%mats(i_row, i_col)%thr_c)
            t_size = SIZE(matrix_set%mats(i_row, i_col)%coo_l)/3
            t_f = 1
            t_l = t_size
!$OMP PARALLEL IF (thread_redist) DEFAULT (NONE) &
!$OMP PRIVATE (ithread) &
!$OMP FIRSTPRIVATE (t_f, t_l, t_size) &
!$OMP SHARED (matrix_set, i_row, i_col, thread_redist)
!$          ithread = OMP_GET_THREAD_NUM()
!$          IF (thread_redist) THEN
!$             t_f = matrix_set%mats(i_row, i_col)%thr_c(ithread + 1) + 1
!$             t_l = matrix_set%mats(i_row, i_col)%thr_c(ithread + 2)
!$          END IF
            t_size = t_l - t_f + 1
!$OMP BARRIER
            IF (t_size .GT. 0) THEN
               CALL call_rec_sort_index( &
                  dbcsr_nblkrows_local(matrix_set%mats(i_row, i_col)), &
                  dbcsr_nblkcols_local(matrix_set%mats(i_row, i_col)), &
                  t_size, &
                  matrix_set%mats(i_row, i_col)%coo_l((t_f*3 - 2):(t_l*3)))
            END IF
!$OMP END PARALLEL
         END DO
      END DO
      CALL timestop(handle)
   END SUBROUTINE setup_rec_index_2d

   SUBROUTINE call_rec_sort_index(m, n, nblks, idx)
      !! Used to thunk a call to rec_sort_index
      INTEGER, INTENT(IN)                                :: m, n, nblks
      INTEGER, DIMENSION(3, 1:nblks), INTENT(INOUT)      :: idx

!   ---------------------------------------------------------------------------

      CALL rec_sort_index(1, m, 1, n, nblks, idx, 0)
   END SUBROUTINE call_rec_sort_index

   SUBROUTINE dbcsr_switch_sets(set1p, set2p)
      !! Switches pointers between two matrix sets
      TYPE(dbcsr_2d_array_type), POINTER                 :: set1p, set2p

      TYPE(dbcsr_2d_array_type), POINTER                 :: tmp_set

!   ---------------------------------------------------------------------------

      tmp_set => set1p
      set1p => set2p
      set2p => tmp_set
   END SUBROUTINE dbcsr_switch_sets

   SUBROUTINE dbcsr_switch_d_ptrs(area1p, area2p)
      !! Switches pointers between two data areas
      TYPE(dbcsr_data_obj), POINTER                      :: area1p, area2p

      TYPE(dbcsr_data_obj), POINTER                      :: tmp_p

      tmp_p => area1p
      area1p => area2p
      area2p => tmp_p
   END SUBROUTINE dbcsr_switch_d_ptrs

# 1 "/__w/dbcsr/dbcsr/src/mm/../data/dbcsr.fypp" 1
# 9 "/__w/dbcsr/dbcsr/src/mm/../data/dbcsr.fypp"
# 11 "/__w/dbcsr/dbcsr/src/mm/../data/dbcsr.fypp"
# 169 "/__w/dbcsr/dbcsr/src/mm/../data/dbcsr.fypp"
# 1926 "/__w/dbcsr/dbcsr/src/mm/dbcsr_mm_cannon.F" 2
# 1927 "/__w/dbcsr/dbcsr/src/mm/dbcsr_mm_cannon.F"
! **************************************************************************************************
   SUBROUTINE prepare_buffers_d (negate_real, negate_imaginary, &
                                 iter, row, col, blk, blk_p, bp, &
                                 row_size, col_size, nze, nsymmetries, symmetry_i, &
                                 stored_row, stored_col, tr_row_size, tr_col_size, tr, &
                                 row_img, col_img, nrow_images, ncol_images, &
                                 row_img_dist, col_img_dist, predist_type_fwd, blacs2mpi, &
                                 target_imgdist, prow, pcol, rowi, coli, &
                                 row_dist, col_dist, dst_p, sm_pos, myt_smp, metalen, &
                                 sd_pos, myt_sdp, send_meta, sd_disp, &
                                 data_area, send_data_area, scale_neg_one, scale_value)
      !!
Prepare buffers for multiplications LOGICAL, INTENT(IN) :: negate_real, negate_imaginary TYPE(dbcsr_iterator), INTENT(INOUT) :: iter INTEGER, INTENT(INOUT) :: row, col, blk, blk_p, row_size, col_size, & nze, bp, symmetry_i, & stored_row, stored_col, tr_row_size, tr_col_size, & row_img, col_img, prow, pcol, rowi, coli, & dst_p, sm_pos, sd_pos INTEGER, INTENT(IN) :: nsymmetries, nrow_images, ncol_images, metalen LOGICAL, INTENT(INOUT) :: tr INTEGER, DIMENSION(:), INTENT(IN), CONTIGUOUS, POINTER :: row_img_dist, col_img_dist, row_dist, col_dist INTEGER, DIMENSION(:), ALLOCATABLE, INTENT(IN) :: sd_disp INTEGER, DIMENSION(:), ALLOCATABLE, INTENT(INOUT) :: myt_smp, myt_sdp, send_meta TYPE(dbcsr_imagedistribution_obj), INTENT(IN) :: target_imgdist INTEGER, DIMENSION(:, :), INTENT(IN), CONTIGUOUS, POINTER :: blacs2mpi CHARACTER, INTENT(IN) :: predist_type_fwd REAL(kind=real_8), DIMENSION(:), INTENT(IN), CONTIGUOUS :: data_area TYPE(dbcsr_data_obj), INTENT(INOUT) :: send_data_area TYPE(dbcsr_scalar_type), INTENT(IN) :: scale_neg_one TYPE(dbcsr_scalar_type), INTENT(IN), OPTIONAL :: scale_value DO WHILE (dbcsr_iterator_blocks_left(iter)) CALL dbcsr_iterator_next_block(iter, row, col, blk, blk_p=blk_p, & row_size=row_size, col_size=col_size) nze = row_size*col_size IF (nze .EQ. 0) CYCLE bp = ABS(blk_p) DO symmetry_i = 1, nsymmetries IF (symmetry_i .EQ. 1) THEN stored_row = row; stored_col = col; tr = blk_p .LT. 0 tr_row_size = col_size; tr_col_size = row_size ELSE IF (row .EQ. col) CYCLE stored_row = col; stored_col = row; tr = blk_p .GT. 0 tr_row_size = row_size; tr_col_size = col_size END IF ! Where do we send this block? row_img = 1 IF (nrow_images .GT. 1) row_img = row_img_dist(stored_row) col_img = 1 IF (ncol_images .GT. 1) col_img = col_img_dist(stored_col) CALL image_calculator(target_imgdist, & prow=prow, rowi=rowi, & pcol=pcol, coli=coli, & myprow=row_dist(stored_row), myrowi=row_img, & mypcol=col_dist(stored_col), mycoli=col_img, & shifting=predist_type_fwd) dst_p = blacs2mpi(prow, pcol) sm_pos = myt_smp(dst_p) myt_smp(dst_p) = myt_smp(dst_p) + metalen sd_pos = myt_sdp(dst_p) myt_sdp(dst_p) = myt_sdp(dst_p) + nze IF (tr) THEN IF (PRESENT(scale_value)) THEN CALL dbcsr_block_transpose(send_data_area%d%r_dp (sd_pos:myt_sdp(dst_p) - 1), & data_area(bp:bp + nze - 1)*scale_value%r_dp, & tr_row_size, tr_col_size) ELSE CALL dbcsr_block_transpose(send_data_area%d%r_dp (sd_pos:myt_sdp(dst_p) - 1), & data_area(bp:bp + nze - 1), & tr_row_size, tr_col_size) END IF IF (negate_real .AND. negate_imaginary) THEN send_data_area%d%r_dp (sd_pos:myt_sdp(dst_p) - 1) = & send_data_area%d%r_dp (sd_pos:myt_sdp(dst_p) - 1)*scale_neg_one%r_dp ELSEIF (negate_real) THEN # 2006 "/__w/dbcsr/dbcsr/src/mm/dbcsr_mm_cannon.F" send_data_area%d%r_dp (sd_pos:myt_sdp(dst_p) - 1) = & -send_data_area%d%r_dp (sd_pos:myt_sdp(dst_p) - 1) # 2015 "/__w/dbcsr/dbcsr/src/mm/dbcsr_mm_cannon.F" ELSEIF (negate_imaginary) THEN # 2020 "/__w/dbcsr/dbcsr/src/mm/dbcsr_mm_cannon.F" END IF ELSE ! Copy the block IF (PRESENT(scale_value)) THEN send_data_area%d%r_dp (sd_pos:myt_sdp(dst_p) - 1) = & data_area(bp:bp + nze - 1)*scale_value%r_dp ELSE send_data_area%d%r_dp (sd_pos:myt_sdp(dst_p) - 1) = data_area(bp:bp + nze - 1) END IF END IF send_meta(sm_pos) = stored_row send_meta(sm_pos + 1) = stored_col send_meta(sm_pos + 2) = sd_pos - sd_disp(dst_p) + 1 send_meta(sm_pos + 3) = rowi send_meta(sm_pos + 4) = coli END DO END DO END SUBROUTINE prepare_buffers_d # 1927 "/__w/dbcsr/dbcsr/src/mm/dbcsr_mm_cannon.F" ! 
************************************************************************************************** SUBROUTINE prepare_buffers_s (negate_real, negate_imaginary, & iter, row, col, blk, blk_p, bp, & row_size, col_size, nze, nsymmetries, symmetry_i, & stored_row, stored_col, tr_row_size, tr_col_size, tr, & row_img, col_img, nrow_images, ncol_images, & row_img_dist, col_img_dist, predist_type_fwd, blacs2mpi, & target_imgdist, prow, pcol, rowi, coli, & row_dist, col_dist, dst_p, sm_pos, myt_smp, metalen, & sd_pos, myt_sdp, send_meta, sd_disp, & data_area, send_data_area, scale_neg_one, scale_value) !! Prepare buffers for multiplications LOGICAL, INTENT(IN) :: negate_real, negate_imaginary TYPE(dbcsr_iterator), INTENT(INOUT) :: iter INTEGER, INTENT(INOUT) :: row, col, blk, blk_p, row_size, col_size, & nze, bp, symmetry_i, & stored_row, stored_col, tr_row_size, tr_col_size, & row_img, col_img, prow, pcol, rowi, coli, & dst_p, sm_pos, sd_pos INTEGER, INTENT(IN) :: nsymmetries, nrow_images, ncol_images, metalen LOGICAL, INTENT(INOUT) :: tr INTEGER, DIMENSION(:), INTENT(IN), CONTIGUOUS, POINTER :: row_img_dist, col_img_dist, row_dist, col_dist INTEGER, DIMENSION(:), ALLOCATABLE, INTENT(IN) :: sd_disp INTEGER, DIMENSION(:), ALLOCATABLE, INTENT(INOUT) :: myt_smp, myt_sdp, send_meta TYPE(dbcsr_imagedistribution_obj), INTENT(IN) :: target_imgdist INTEGER, DIMENSION(:, :), INTENT(IN), CONTIGUOUS, POINTER :: blacs2mpi CHARACTER, INTENT(IN) :: predist_type_fwd REAL(kind=real_4), DIMENSION(:), INTENT(IN), CONTIGUOUS :: data_area TYPE(dbcsr_data_obj), INTENT(INOUT) :: send_data_area TYPE(dbcsr_scalar_type), INTENT(IN) :: scale_neg_one TYPE(dbcsr_scalar_type), INTENT(IN), OPTIONAL :: scale_value DO WHILE (dbcsr_iterator_blocks_left(iter)) CALL dbcsr_iterator_next_block(iter, row, col, blk, blk_p=blk_p, & row_size=row_size, col_size=col_size) nze = row_size*col_size IF (nze .EQ. 0) CYCLE bp = ABS(blk_p) DO symmetry_i = 1, nsymmetries IF (symmetry_i .EQ. 1) THEN stored_row = row; stored_col = col; tr = blk_p .LT. 0 tr_row_size = col_size; tr_col_size = row_size ELSE IF (row .EQ. col) CYCLE stored_row = col; stored_col = row; tr = blk_p .GT. 0 tr_row_size = row_size; tr_col_size = col_size END IF ! Where do we send this block? row_img = 1 IF (nrow_images .GT. 1) row_img = row_img_dist(stored_row) col_img = 1 IF (ncol_images .GT. 1) col_img = col_img_dist(stored_col) CALL image_calculator(target_imgdist, & prow=prow, rowi=rowi, & pcol=pcol, coli=coli, & myprow=row_dist(stored_row), myrowi=row_img, & mypcol=col_dist(stored_col), mycoli=col_img, & shifting=predist_type_fwd) dst_p = blacs2mpi(prow, pcol) sm_pos = myt_smp(dst_p) myt_smp(dst_p) = myt_smp(dst_p) + metalen sd_pos = myt_sdp(dst_p) myt_sdp(dst_p) = myt_sdp(dst_p) + nze IF (tr) THEN IF (PRESENT(scale_value)) THEN CALL dbcsr_block_transpose(send_data_area%d%r_sp (sd_pos:myt_sdp(dst_p) - 1), & data_area(bp:bp + nze - 1)*scale_value%r_sp, & tr_row_size, tr_col_size) ELSE CALL dbcsr_block_transpose(send_data_area%d%r_sp (sd_pos:myt_sdp(dst_p) - 1), & data_area(bp:bp + nze - 1), & tr_row_size, tr_col_size) END IF IF (negate_real .AND. 
negate_imaginary) THEN send_data_area%d%r_sp (sd_pos:myt_sdp(dst_p) - 1) = & send_data_area%d%r_sp (sd_pos:myt_sdp(dst_p) - 1)*scale_neg_one%r_sp ELSEIF (negate_real) THEN # 2006 "/__w/dbcsr/dbcsr/src/mm/dbcsr_mm_cannon.F" send_data_area%d%r_sp (sd_pos:myt_sdp(dst_p) - 1) = & -send_data_area%d%r_sp (sd_pos:myt_sdp(dst_p) - 1) # 2015 "/__w/dbcsr/dbcsr/src/mm/dbcsr_mm_cannon.F" ELSEIF (negate_imaginary) THEN # 2020 "/__w/dbcsr/dbcsr/src/mm/dbcsr_mm_cannon.F" END IF ELSE ! Copy the block IF (PRESENT(scale_value)) THEN send_data_area%d%r_sp (sd_pos:myt_sdp(dst_p) - 1) = & data_area(bp:bp + nze - 1)*scale_value%r_sp ELSE send_data_area%d%r_sp (sd_pos:myt_sdp(dst_p) - 1) = data_area(bp:bp + nze - 1) END IF END IF send_meta(sm_pos) = stored_row send_meta(sm_pos + 1) = stored_col send_meta(sm_pos + 2) = sd_pos - sd_disp(dst_p) + 1 send_meta(sm_pos + 3) = rowi send_meta(sm_pos + 4) = coli END DO END DO END SUBROUTINE prepare_buffers_s # 1927 "/__w/dbcsr/dbcsr/src/mm/dbcsr_mm_cannon.F" ! ************************************************************************************************** SUBROUTINE prepare_buffers_z (negate_real, negate_imaginary, & iter, row, col, blk, blk_p, bp, & row_size, col_size, nze, nsymmetries, symmetry_i, & stored_row, stored_col, tr_row_size, tr_col_size, tr, & row_img, col_img, nrow_images, ncol_images, & row_img_dist, col_img_dist, predist_type_fwd, blacs2mpi, & target_imgdist, prow, pcol, rowi, coli, & row_dist, col_dist, dst_p, sm_pos, myt_smp, metalen, & sd_pos, myt_sdp, send_meta, sd_disp, & data_area, send_data_area, scale_neg_one, scale_value) !! Prepare buffers for multiplications LOGICAL, INTENT(IN) :: negate_real, negate_imaginary TYPE(dbcsr_iterator), INTENT(INOUT) :: iter INTEGER, INTENT(INOUT) :: row, col, blk, blk_p, row_size, col_size, & nze, bp, symmetry_i, & stored_row, stored_col, tr_row_size, tr_col_size, & row_img, col_img, prow, pcol, rowi, coli, & dst_p, sm_pos, sd_pos INTEGER, INTENT(IN) :: nsymmetries, nrow_images, ncol_images, metalen LOGICAL, INTENT(INOUT) :: tr INTEGER, DIMENSION(:), INTENT(IN), CONTIGUOUS, POINTER :: row_img_dist, col_img_dist, row_dist, col_dist INTEGER, DIMENSION(:), ALLOCATABLE, INTENT(IN) :: sd_disp INTEGER, DIMENSION(:), ALLOCATABLE, INTENT(INOUT) :: myt_smp, myt_sdp, send_meta TYPE(dbcsr_imagedistribution_obj), INTENT(IN) :: target_imgdist INTEGER, DIMENSION(:, :), INTENT(IN), CONTIGUOUS, POINTER :: blacs2mpi CHARACTER, INTENT(IN) :: predist_type_fwd COMPLEX(kind=real_8), DIMENSION(:), INTENT(IN), CONTIGUOUS :: data_area TYPE(dbcsr_data_obj), INTENT(INOUT) :: send_data_area TYPE(dbcsr_scalar_type), INTENT(IN) :: scale_neg_one TYPE(dbcsr_scalar_type), INTENT(IN), OPTIONAL :: scale_value DO WHILE (dbcsr_iterator_blocks_left(iter)) CALL dbcsr_iterator_next_block(iter, row, col, blk, blk_p=blk_p, & row_size=row_size, col_size=col_size) nze = row_size*col_size IF (nze .EQ. 0) CYCLE bp = ABS(blk_p) DO symmetry_i = 1, nsymmetries IF (symmetry_i .EQ. 1) THEN stored_row = row; stored_col = col; tr = blk_p .LT. 0 tr_row_size = col_size; tr_col_size = row_size ELSE IF (row .EQ. col) CYCLE stored_row = col; stored_col = row; tr = blk_p .GT. 0 tr_row_size = row_size; tr_col_size = col_size END IF ! Where do we send this block? row_img = 1 IF (nrow_images .GT. 1) row_img = row_img_dist(stored_row) col_img = 1 IF (ncol_images .GT. 
1) col_img = col_img_dist(stored_col) CALL image_calculator(target_imgdist, & prow=prow, rowi=rowi, & pcol=pcol, coli=coli, & myprow=row_dist(stored_row), myrowi=row_img, & mypcol=col_dist(stored_col), mycoli=col_img, & shifting=predist_type_fwd) dst_p = blacs2mpi(prow, pcol) sm_pos = myt_smp(dst_p) myt_smp(dst_p) = myt_smp(dst_p) + metalen sd_pos = myt_sdp(dst_p) myt_sdp(dst_p) = myt_sdp(dst_p) + nze IF (tr) THEN IF (PRESENT(scale_value)) THEN CALL dbcsr_block_transpose(send_data_area%d%c_dp (sd_pos:myt_sdp(dst_p) - 1), & data_area(bp:bp + nze - 1)*scale_value%c_dp, & tr_row_size, tr_col_size) ELSE CALL dbcsr_block_transpose(send_data_area%d%c_dp (sd_pos:myt_sdp(dst_p) - 1), & data_area(bp:bp + nze - 1), & tr_row_size, tr_col_size) END IF IF (negate_real .AND. negate_imaginary) THEN send_data_area%d%c_dp (sd_pos:myt_sdp(dst_p) - 1) = & send_data_area%d%c_dp (sd_pos:myt_sdp(dst_p) - 1)*scale_neg_one%c_dp ELSEIF (negate_real) THEN # 2009 "/__w/dbcsr/dbcsr/src/mm/dbcsr_mm_cannon.F" send_data_area%d%c_dp (sd_pos:myt_sdp(dst_p) - 1) = & CMPLX( & -REAL(send_data_area%d%c_dp (sd_pos:myt_sdp(dst_p) - 1), KIND=real_8), & AIMAG(send_data_area%d%c_dp (sd_pos:myt_sdp(dst_p) - 1)), & KIND=real_8) # 2015 "/__w/dbcsr/dbcsr/src/mm/dbcsr_mm_cannon.F" ELSEIF (negate_imaginary) THEN # 2017 "/__w/dbcsr/dbcsr/src/mm/dbcsr_mm_cannon.F" send_data_area%d%c_dp (sd_pos:myt_sdp(dst_p) - 1) = & CONJG(send_data_area%d%c_dp (sd_pos:myt_sdp(dst_p) - 1)) # 2020 "/__w/dbcsr/dbcsr/src/mm/dbcsr_mm_cannon.F" END IF ELSE ! Copy the block IF (PRESENT(scale_value)) THEN send_data_area%d%c_dp (sd_pos:myt_sdp(dst_p) - 1) = & data_area(bp:bp + nze - 1)*scale_value%c_dp ELSE send_data_area%d%c_dp (sd_pos:myt_sdp(dst_p) - 1) = data_area(bp:bp + nze - 1) END IF END IF send_meta(sm_pos) = stored_row send_meta(sm_pos + 1) = stored_col send_meta(sm_pos + 2) = sd_pos - sd_disp(dst_p) + 1 send_meta(sm_pos + 3) = rowi send_meta(sm_pos + 4) = coli END DO END DO END SUBROUTINE prepare_buffers_z # 1927 "/__w/dbcsr/dbcsr/src/mm/dbcsr_mm_cannon.F" ! ************************************************************************************************** SUBROUTINE prepare_buffers_c (negate_real, negate_imaginary, & iter, row, col, blk, blk_p, bp, & row_size, col_size, nze, nsymmetries, symmetry_i, & stored_row, stored_col, tr_row_size, tr_col_size, tr, & row_img, col_img, nrow_images, ncol_images, & row_img_dist, col_img_dist, predist_type_fwd, blacs2mpi, & target_imgdist, prow, pcol, rowi, coli, & row_dist, col_dist, dst_p, sm_pos, myt_smp, metalen, & sd_pos, myt_sdp, send_meta, sd_disp, & data_area, send_data_area, scale_neg_one, scale_value) !! 
Prepare buffers for multiplications LOGICAL, INTENT(IN) :: negate_real, negate_imaginary TYPE(dbcsr_iterator), INTENT(INOUT) :: iter INTEGER, INTENT(INOUT) :: row, col, blk, blk_p, row_size, col_size, & nze, bp, symmetry_i, & stored_row, stored_col, tr_row_size, tr_col_size, & row_img, col_img, prow, pcol, rowi, coli, & dst_p, sm_pos, sd_pos INTEGER, INTENT(IN) :: nsymmetries, nrow_images, ncol_images, metalen LOGICAL, INTENT(INOUT) :: tr INTEGER, DIMENSION(:), INTENT(IN), CONTIGUOUS, POINTER :: row_img_dist, col_img_dist, row_dist, col_dist INTEGER, DIMENSION(:), ALLOCATABLE, INTENT(IN) :: sd_disp INTEGER, DIMENSION(:), ALLOCATABLE, INTENT(INOUT) :: myt_smp, myt_sdp, send_meta TYPE(dbcsr_imagedistribution_obj), INTENT(IN) :: target_imgdist INTEGER, DIMENSION(:, :), INTENT(IN), CONTIGUOUS, POINTER :: blacs2mpi CHARACTER, INTENT(IN) :: predist_type_fwd COMPLEX(kind=real_4), DIMENSION(:), INTENT(IN), CONTIGUOUS :: data_area TYPE(dbcsr_data_obj), INTENT(INOUT) :: send_data_area TYPE(dbcsr_scalar_type), INTENT(IN) :: scale_neg_one TYPE(dbcsr_scalar_type), INTENT(IN), OPTIONAL :: scale_value DO WHILE (dbcsr_iterator_blocks_left(iter)) CALL dbcsr_iterator_next_block(iter, row, col, blk, blk_p=blk_p, & row_size=row_size, col_size=col_size) nze = row_size*col_size IF (nze .EQ. 0) CYCLE bp = ABS(blk_p) DO symmetry_i = 1, nsymmetries IF (symmetry_i .EQ. 1) THEN stored_row = row; stored_col = col; tr = blk_p .LT. 0 tr_row_size = col_size; tr_col_size = row_size ELSE IF (row .EQ. col) CYCLE stored_row = col; stored_col = row; tr = blk_p .GT. 0 tr_row_size = row_size; tr_col_size = col_size END IF ! Where do we send this block? row_img = 1 IF (nrow_images .GT. 1) row_img = row_img_dist(stored_row) col_img = 1 IF (ncol_images .GT. 1) col_img = col_img_dist(stored_col) CALL image_calculator(target_imgdist, & prow=prow, rowi=rowi, & pcol=pcol, coli=coli, & myprow=row_dist(stored_row), myrowi=row_img, & mypcol=col_dist(stored_col), mycoli=col_img, & shifting=predist_type_fwd) dst_p = blacs2mpi(prow, pcol) sm_pos = myt_smp(dst_p) myt_smp(dst_p) = myt_smp(dst_p) + metalen sd_pos = myt_sdp(dst_p) myt_sdp(dst_p) = myt_sdp(dst_p) + nze IF (tr) THEN IF (PRESENT(scale_value)) THEN CALL dbcsr_block_transpose(send_data_area%d%c_sp (sd_pos:myt_sdp(dst_p) - 1), & data_area(bp:bp + nze - 1)*scale_value%c_sp, & tr_row_size, tr_col_size) ELSE CALL dbcsr_block_transpose(send_data_area%d%c_sp (sd_pos:myt_sdp(dst_p) - 1), & data_area(bp:bp + nze - 1), & tr_row_size, tr_col_size) END IF IF (negate_real .AND. negate_imaginary) THEN send_data_area%d%c_sp (sd_pos:myt_sdp(dst_p) - 1) = & send_data_area%d%c_sp (sd_pos:myt_sdp(dst_p) - 1)*scale_neg_one%c_sp ELSEIF (negate_real) THEN # 2009 "/__w/dbcsr/dbcsr/src/mm/dbcsr_mm_cannon.F" send_data_area%d%c_sp (sd_pos:myt_sdp(dst_p) - 1) = & CMPLX( & -REAL(send_data_area%d%c_sp (sd_pos:myt_sdp(dst_p) - 1), KIND=real_4), & AIMAG(send_data_area%d%c_sp (sd_pos:myt_sdp(dst_p) - 1)), & KIND=real_4) # 2015 "/__w/dbcsr/dbcsr/src/mm/dbcsr_mm_cannon.F" ELSEIF (negate_imaginary) THEN # 2017 "/__w/dbcsr/dbcsr/src/mm/dbcsr_mm_cannon.F" send_data_area%d%c_sp (sd_pos:myt_sdp(dst_p) - 1) = & CONJG(send_data_area%d%c_sp (sd_pos:myt_sdp(dst_p) - 1)) # 2020 "/__w/dbcsr/dbcsr/src/mm/dbcsr_mm_cannon.F" END IF ELSE ! 
               ! Copy the block
               IF (PRESENT(scale_value)) THEN
                  send_data_area%d%c_sp (sd_pos:myt_sdp(dst_p) - 1) = &
                     data_area(bp:bp + nze - 1)*scale_value%c_sp
               ELSE
                  send_data_area%d%c_sp (sd_pos:myt_sdp(dst_p) - 1) = data_area(bp:bp + nze - 1)
               END IF
            END IF
            send_meta(sm_pos) = stored_row
            send_meta(sm_pos + 1) = stored_col
            send_meta(sm_pos + 2) = sd_pos - sd_disp(dst_p) + 1
            send_meta(sm_pos + 3) = rowi
            send_meta(sm_pos + 4) = coli
         END DO
      END DO
   END SUBROUTINE prepare_buffers_c
# 2040 "/__w/dbcsr/dbcsr/src/mm/dbcsr_mm_cannon.F"
END MODULE dbcsr_mm_cannon