Performs a multiplication of two dbcsr_type matrices, as C := alpha * op( A ) * op( B ) + beta * C.
Matrices m_a and m_b are multiplied into the m_c product matrix. If the dist2d parameter is not specified, then a new distribution_2d is determined for it.
Non-equal column dimensions of the right and product matrices The right and product matrix are allowed to have different (full) column dimensions. If they differ, there are certain peculiar behaviors, then the last_column is effectively set to the minimal of the two.
Beta scaling of the right product matrix If the effective last_column is less than the full column dimension of the product matrix, then the scaling of the product matrix with beta is limited to the submatrix specified by last_column.
Filtering The filter_eps parameter, if present, is used to filter the resulting matrix. The filtering criterion is whether the block-frobenius norm is less than the specified epsilon. One-the-fly filtering is done such that individual multiplications are skipped if the product of the frobenius norms of the left- and right-matrix blocks are less than the specified epsilon divided by the maximum number of possible multiplies in each row. In addition a final filtering is done as well with the same epsilon value.
Type | Intent | Optional | Attributes | Name | ||
---|---|---|---|---|---|---|
character(len=1), | intent(in) | :: | transa |
specifies the form of op( A ) to be used in the matrix multiplication transa = 'N' or 'n', op( A ) = A. transa = 'T' or 't', op( A ) = transpose(A). transa = 'C' or 'c', op( A ) = transpose(conjg(A)). specifies the form of op( B ) to be used in the matrix multiplication transb = 'N' or 'n', op( B ) = B. transb = 'T' or 't', op( B ) = transpose(B). transb = 'C' or 'c', op( B ) = transpose(conjg(B)). |
||
character(len=1), | intent(in) | :: | transb |
specifies the form of op( A ) to be used in the matrix multiplication transa = 'N' or 'n', op( A ) = A. transa = 'T' or 't', op( A ) = transpose(A). transa = 'C' or 'c', op( A ) = transpose(conjg(A)). specifies the form of op( B ) to be used in the matrix multiplication transb = 'N' or 'n', op( B ) = B. transb = 'T' or 't', op( B ) = transpose(B). transb = 'C' or 'c', op( B ) = transpose(conjg(B)). |
||
type(dbcsr_scalar_type), | intent(in) | :: | alpha |
scaling of product |
||
type(dbcsr_type), | intent(in) | :: | matrix_a |
left BCSR matrix right BCSR matrix |
||
type(dbcsr_type), | intent(in) | :: | matrix_b |
left BCSR matrix right BCSR matrix |
||
type(dbcsr_scalar_type), | intent(in) | :: | beta |
scaling of existing data |
||
type(dbcsr_type), | intent(inout) | :: | matrix_c |
resulting BCSR product matrix. |
||
integer, | intent(in), | optional | :: | first_row |
first full row of limiting submatrix last full row of limiting submatrix first full column of limiting submatrix last full column of limiting submatrix first full column of limiting inner product last full column of limiting inner product |
|
integer, | intent(in), | optional | :: | last_row |
first full row of limiting submatrix last full row of limiting submatrix first full column of limiting submatrix last full column of limiting submatrix first full column of limiting inner product last full column of limiting inner product |
|
integer, | intent(in), | optional | :: | first_column |
first full row of limiting submatrix last full row of limiting submatrix first full column of limiting submatrix last full column of limiting submatrix first full column of limiting inner product last full column of limiting inner product |
|
integer, | intent(in), | optional | :: | last_column |
first full row of limiting submatrix last full row of limiting submatrix first full column of limiting submatrix last full column of limiting submatrix first full column of limiting inner product last full column of limiting inner product |
|
integer, | intent(in), | optional | :: | first_k |
first full row of limiting submatrix last full row of limiting submatrix first full column of limiting submatrix last full column of limiting submatrix first full column of limiting inner product last full column of limiting inner product |
|
integer, | intent(in), | optional | :: | last_k |
first full row of limiting submatrix last full row of limiting submatrix first full column of limiting submatrix last full column of limiting submatrix first full column of limiting inner product last full column of limiting inner product |
|
logical, | intent(in), | optional | :: | retain_sparsity |
enforce the sparsity pattern of the existing product matrix; default is no |
|
real(kind=real_8), | intent(in), | optional | :: | filter_eps |
Filtering of the matrix |
|
integer(kind=int_8), | intent(out), | optional | :: | flop |
effective flop |
SUBROUTINE dbcsr_multiply_generic(transa, transb, & alpha, matrix_a, matrix_b, beta, matrix_c, & first_row, last_row, first_column, last_column, first_k, last_k, & retain_sparsity, filter_eps, & flop) !! Performs a multiplication of two dbcsr_type matrices, !! as C := alpha * op( A ) * op( B ) + beta * C. !! !! Matrices m_a and m_b are multiplied into the m_c product matrix. If the !! dist2d parameter is not specified, then a new distribution_2d is !! determined for it. !! !! Non-equal column dimensions of the right and product matrices !! The right and product matrix are allowed to have different !! (full) column dimensions. If they differ, there are certain !! peculiar behaviors, then the last_column is effectively set to !! the minimal of the two. !! !! Beta scaling of the right product matrix !! If the effective last_column is less than the full column !! dimension of the product matrix, then the scaling of the !! product matrix with beta is limited to the submatrix specified !! by last_column. !! !! Filtering !! The filter_eps parameter, if present, is used to filter the !! resulting matrix. The filtering criterion is whether the !! block-frobenius norm is less than the specified epsilon. !! One-the-fly filtering is done such that individual !! multiplications are skipped if the product of the frobenius !! norms of the left- and right-matrix blocks are less than the !! specified epsilon divided by the maximum number of possible !! multiplies in each row. In addition a final filtering is done !! as well with the same epsilon value. CHARACTER(LEN=1), INTENT(IN) :: transa, transb !! specifies the form of op( A ) to be used in the matrix multiplication transa = 'N' or 'n', op( A ) = A. transa = 'T' or !! 't', op( A ) = transpose(A). transa = 'C' or 'c', op( A ) = transpose(conjg(A)). !! specifies the form of op( B ) to be used in the matrix multiplication transb = 'N' or 'n', op( B ) = B. transb = 'T' or !! 't', op( B ) = transpose(B). transb = 'C' or 'c', op( B ) = transpose(conjg(B)). TYPE(dbcsr_scalar_type), INTENT(IN) :: alpha !! scaling of product TYPE(dbcsr_type), INTENT(IN) :: matrix_a, matrix_b !! left BCSR matrix !! right BCSR matrix TYPE(dbcsr_scalar_type), INTENT(IN) :: beta !! scaling of existing data TYPE(dbcsr_type), INTENT(INOUT) :: matrix_c !! resulting BCSR product matrix. INTEGER, INTENT(IN), OPTIONAL :: first_row, last_row, first_column, & last_column, first_k, last_k !! first full row of limiting submatrix !! last full row of limiting submatrix !! first full column of limiting submatrix !! last full column of limiting submatrix !! first full column of limiting inner product !! last full column of limiting inner product LOGICAL, INTENT(IN), OPTIONAL :: retain_sparsity !! enforce the sparsity pattern of the existing product matrix; default is no REAL(KIND=real_8), INTENT(IN), OPTIONAL :: filter_eps !! Filtering of the matrix INTEGER(KIND=int_8), INTENT(OUT), OPTIONAL :: flop !! effective flop CHARACTER(len=*), PARAMETER :: routineN = 'dbcsr_multiply_generic' LOGICAL, PARAMETER :: dbg = .FALSE. REAL(real_8), PARAMETER :: make_dense_occ_thresh = 1.0_dp CHARACTER :: transa_l, transb_l INTEGER :: f_col, f_k, f_row, handle, handle2, ithread, l_col, l_k, l_row, & nimages_left_rows, nimages_match, nimages_right_cols, npcols, nprows, numnodes, & data_type, output_unit INTEGER(KIND=int_8) :: my_flop LOGICAL :: ab_dense, keep_product_data, keep_sparsity, product_reindex, release_tdist, & transpose_left, transpose_right, use_dense_mult, use_mempools, thread_dist_force REAL(KIND=dp) :: cs TYPE(array_i1d_obj) :: dense_col_sizes, dense_k_sizes, dense_row_sizes, k_vmap, m_map, & n_map, old_product_col_blk_offsets, old_product_col_blk_sizes, & old_product_row_blk_offsets, old_product_row_blk_sizes, & matrix_c_thread_dist TYPE(dbcsr_2d_array_type), POINTER :: m2s_left, m2s_right TYPE(dbcsr_distribution_obj) :: dense_product_distribution, & old_product_distribution TYPE(dbcsr_imagedistribution_obj) :: dense_rdist_left, dense_rdist_right, & rdist_left, rdist_right TYPE(dbcsr_mp_obj) :: mp_obj TYPE(dbcsr_type) :: matrix_left, matrix_right, product_matrix TYPE(mp_comm_type) :: comm CALL timeset(routineN, handle) IF (dbcsr_get_occupation(matrix_a) .GT. 1) & DBCSR_ABORT("Matrix A occupation > 1") IF (dbcsr_get_occupation(matrix_b) .GT. 1) & DBCSR_ABORT("Matrix B occupation > 1") IF (dbcsr_get_occupation(matrix_c) .GT. 1) & DBCSR_ABORT("Matrix C occupation > 1") CALL array_nullify(dense_k_sizes) CALL array_nullify(dense_col_sizes) CALL array_nullify(dense_row_sizes) ! Reset GPU errors IF (use_acc()) THEN CALL dbcsr_acc_clear_errors() END IF ! Check if RMA is used with OpenMPI, if so disabled it ! (OpenMPI has several bugs with RMA and it does not ! give any performance benefit) CALL check_openmpi_rma() use_mempools = dbcsr_cfg%use_mempools_cpu%val .OR. use_acc() ! setup driver-dependent memory-types and their memory-pools --------------- ! the ab_buffers are shared by all threads IF (use_acc()) THEN IF (.NOT. acc_stream_associated(stream_1)) THEN CALL acc_stream_create(stream_1, "MemCpy (odd ticks)") CALL acc_stream_create(stream_2, "MemCpy (even ticks)") END IF CALL dbcsr_memtype_setup(memtype_abpanel_1, has_pool=.TRUE., & acc_hostalloc=.TRUE., acc_devalloc=.TRUE., acc_stream=stream_1, & mpi=.TRUE., oversize_factor=default_resize_factor) CALL dbcsr_memtype_setup(memtype_abpanel_2, has_pool=.TRUE., & acc_hostalloc=.TRUE., acc_devalloc=.TRUE., acc_stream=stream_2, & mpi=.TRUE., oversize_factor=default_resize_factor) !TODO: ensure capacity 2/3? CALL dbcsr_memtype_setup(memtype_trsbuffer_1, has_pool=.TRUE., & acc_hostalloc=.TRUE., acc_devalloc=.TRUE., acc_stream=stream_1) CALL dbcsr_memtype_setup(memtype_trsbuffer_2, has_pool=.TRUE., & acc_hostalloc=.TRUE., acc_devalloc=.TRUE., acc_stream=stream_2) CALL dbcsr_memtype_setup(memtype_normsbuf, has_pool=.TRUE., & acc_hostalloc=.TRUE., acc_devalloc=.TRUE., acc_stream=stream_1) CALL dbcsr_memtype_setup(memtype_offsetsbuf, has_pool=.TRUE., & acc_hostalloc=.TRUE., acc_devalloc=.TRUE., acc_stream=stream_1) CALL dbcsr_memtype_setup(memtype_nelemsbuf, has_pool=.TRUE., & acc_hostalloc=.TRUE., acc_devalloc=.TRUE., acc_stream=stream_1) CALL dbcsr_mempool_limit_capacity(memtype_trsbuffer_1%pool, capacity=1) CALL dbcsr_mempool_limit_capacity(memtype_trsbuffer_2%pool, capacity=1) CALL dbcsr_mempool_limit_capacity(memtype_normsbuf%pool, capacity=1) CALL dbcsr_mempool_limit_capacity(memtype_offsetsbuf%pool, capacity=1) CALL dbcsr_mempool_limit_capacity(memtype_nelemsbuf%pool, capacity=1) END IF CALL dbcsr_memtype_setup(memtype_mpi_buffer, mpi=.TRUE.) CALL dbcsr_memtype_setup(memtype_mpi_product, mpi=.TRUE., has_pool=use_mempools) ! check parameters --------------------------------------------------------- transa_l = transa transb_l = transb CALL uppercase(transa_l) CALL uppercase(transb_l) IF (transa_l .NE. dbcsr_no_transpose .AND. & transa_l .NE. dbcsr_transpose .AND. & transa_l .NE. dbcsr_conjugate_transpose) & DBCSR_ABORT("Invalid transa_l = "//transa_l) IF (transb_l .NE. dbcsr_no_transpose .AND. & transb_l .NE. dbcsr_transpose .AND. & transb_l .NE. dbcsr_conjugate_transpose) & DBCSR_ABORT("Invalid transb_l = "//transb_l) IF (dbg) THEN WRITE (*, *) '========== MULTIPLICATION ========================' CALL dbcsr_verify_matrix(matrix_a) CALL dbcsr_verify_matrix(matrix_b) CALL dbcsr_verify_matrix(matrix_c) WRITE (*, *) routineN//" ABC checksums", & dbcsr_checksum(matrix_a), & dbcsr_checksum(matrix_b), & dbcsr_checksum(matrix_c) IF (dbg) THEN CALL dbcsr_print(matrix_a, nodata=.TRUE.) CALL dbcsr_print(matrix_b, nodata=.TRUE.) CALL dbcsr_print(matrix_c, nodata=.TRUE.) END IF END IF ! transpose/conjg left and/or right matrices if needed transpose_left = .FALSE. SELECT CASE (transa_l) CASE (dbcsr_no_transpose) matrix_left = matrix_a transpose_left = .FALSE. CASE (dbcsr_transpose) matrix_left = dbcsr_type() IF (dbcsr_get_matrix_type(matrix_a) .EQ. dbcsr_type_antisymmetric) THEN ! ! For antisymmetric matrix, we need to do a hard copy ! shallow_data_copy=.TRUE. does not handle properly antisymm matrices CALL dbcsr_new_transposed(matrix_left, matrix_a, & shallow_data_copy=.FALSE., redistribute=.FALSE., & transpose_distribution=.FALSE.) ELSE CALL dbcsr_new_transposed(matrix_left, matrix_a, & shallow_data_copy=.TRUE., redistribute=.FALSE., & transpose_distribution=.FALSE.) END IF transpose_left = .TRUE. CASE (dbcsr_conjugate_transpose) matrix_left = dbcsr_type() CALL dbcsr_new_transposed(matrix_left, matrix_a, & shallow_data_copy=.FALSE., redistribute=.FALSE., & transpose_distribution=.FALSE.) CALL dbcsr_conjg(matrix_left) transpose_left = .TRUE. CASE DEFAULT DBCSR_ABORT("wrong transa_l = "//transa_l) END SELECT transpose_right = .FALSE. SELECT CASE (transb_l) CASE (dbcsr_no_transpose) matrix_right = matrix_b transpose_right = .FALSE. CASE (dbcsr_transpose) matrix_right = dbcsr_type() IF (dbcsr_get_matrix_type(matrix_b) .EQ. dbcsr_type_antisymmetric) THEN ! ! For antisymmetric matrix, we need to do a hard copy ! shallow_data_copy=.TRUE. does not handle properly antisymm matrices CALL dbcsr_new_transposed(matrix_right, matrix_b, & shallow_data_copy=.FALSE., redistribute=.FALSE., & transpose_distribution=.FALSE.) ELSE CALL dbcsr_new_transposed(matrix_right, matrix_b, & shallow_data_copy=.TRUE., redistribute=.FALSE., & transpose_distribution=.FALSE.) END IF transpose_right = .TRUE. CASE (dbcsr_conjugate_transpose) matrix_right = dbcsr_type() CALL dbcsr_new_transposed(matrix_right, matrix_b, & shallow_data_copy=.FALSE., redistribute=.FALSE., & transpose_distribution=.FALSE.) CALL dbcsr_conjg(matrix_right) transpose_right = .TRUE. CASE DEFAULT DBCSR_ABORT("wrong transb_l = "//transb_l) END SELECT ! ! Ensure matrix compatibility. IF (.NOT. array_equality(matrix_c%row_blk_offset, matrix_left%row_blk_offset)) & DBCSR_ABORT("C/A rows not equal") IF (.NOT. array_equality(matrix_c%col_blk_offset, matrix_right%col_blk_offset)) & DBCSR_ABORT("C/B columns not equal") IF (.NOT. array_equality(matrix_left%col_blk_offset, matrix_right%row_blk_offset)) & DBCSR_ABORT("A cols/B rows not equal") ! ! No dense multiplication when filtering is used. use_dense_mult = dbcsr_cfg%mm_dense%val .AND. (.NOT. PRESENT(filter_eps)) ! mp_obj = dbcsr_distribution_mp(matrix_c%dist) numnodes = dbcsr_mp_numnodes(mp_obj) nprows = dbcsr_mp_nprows(mp_obj) npcols = dbcsr_mp_npcols(mp_obj) ! ! 3D layers CALL make_layers_3D_C_reduction(dbcsr_cfg%num_layers_3D%val, mp_obj) ! ! No dense multiplication when RMA is used. IF (dbcsr_cfg%use_mpi_rma%val) THEN use_dense_mult = .FALSE. END IF ! we skip dense multiply for (anti)symmetric matrices (slowdown for S/H * C) IF (use_dense_mult) THEN IF (dbcsr_has_symmetry(matrix_left) .OR. & dbcsr_has_symmetry(matrix_right)) THEN use_dense_mult = .FALSE. ELSE use_dense_mult = dbcsr_may_be_dense(matrix_left, make_dense_occ_thresh) & .AND. dbcsr_may_be_dense(matrix_right, make_dense_occ_thresh) END IF END IF ab_dense = use_dense_mult ! Use memory pools when no dense IF (.NOT. use_acc()) THEN CALL dbcsr_memtype_setup(memtype_abpanel_1, has_pool=.NOT. ab_dense .AND. use_mempools, mpi=.TRUE.) CALL dbcsr_memtype_setup(memtype_abpanel_2, has_pool=.NOT. ab_dense .AND. use_mempools, mpi=.TRUE.) END IF ! ! Submatrix selection f_row = 1 l_row = dbcsr_nfullrows_total(matrix_c) f_col = 1 l_col = dbcsr_nfullcols_total(matrix_c) f_k = 1 l_k = dbcsr_nfullcols_total(matrix_left) IF (PRESENT(first_row)) THEN IF (first_row .LT. 1 .OR. first_row .GT. dbcsr_nfullrows_total(matrix_c)) & DBCSR_ABORT("Invalid first row specified") f_row = first_row END IF IF (PRESENT(last_row)) THEN IF (last_row .GT. dbcsr_nfullrows_total(matrix_c)) & DBCSR_ABORT("Invalid last row specified") l_row = last_row END IF IF (PRESENT(first_column)) THEN IF (first_column .LT. 1 .OR. first_column .GT. dbcsr_nfullcols_total(matrix_c)) & DBCSR_ABORT("Invalid first col specified") f_col = first_column END IF IF (PRESENT(last_column)) THEN IF (last_column .GT. dbcsr_nfullcols_total(matrix_c)) & DBCSR_ABORT("Invalid last column specified (C)") IF (last_column .GT. dbcsr_nfullcols_total(matrix_right)) & DBCSR_ABORT("Invalid last column specified (B)") l_col = last_column END IF IF (PRESENT(first_k)) THEN IF (first_k .LT. 1 .OR. first_k .GT. dbcsr_nfullcols_total(matrix_left)) & DBCSR_ABORT("Invalid first k specified (A)") f_k = first_k END IF IF (PRESENT(last_k)) THEN IF (last_k .GT. dbcsr_nfullcols_total(matrix_left)) & DBCSR_ABORT("Invalid last k specified (A)") l_k = last_k END IF ! ! update statistics (we count marketing flops per MPI rank) dbcsr_mpi_statistics%last_mpi_ranks_used = numnodes marketing_flops = marketing_flops + & (2.0*(l_row - f_row + 1.0)*(l_col - f_col + 1.0)/numnodes)*(l_k - f_k + 1.0) ! ! Now optimize the default submatrix selection values away IF (f_row .EQ. 1) f_row = 0 IF (l_row .EQ. dbcsr_nfullrows_total(matrix_left)) l_row = 0 IF (f_col .EQ. 1) f_col = 0 ! The last column must be set if the right and product matrices ! differ. l_col = MIN(l_col, dbcsr_nfullcols_total(matrix_right)) l_col = MIN(l_col, dbcsr_nfullcols_total(matrix_c)) IF (f_col .LE. 1 .AND. & l_col .EQ. dbcsr_nfullcols_total(matrix_right) .AND. & dbcsr_nfullcols_total(matrix_right) .EQ. & dbcsr_nfullcols_total(matrix_c)) l_col = 0 IF (f_k .EQ. 1) f_k = 0 IF (l_k .EQ. dbcsr_nfullcols_total(matrix_left)) l_k = 0 IF (.NOT. PRESENT(last_column) .AND. & .NOT. array_equality(matrix_right%col_blk_size, & matrix_c%col_blk_size)) THEN l_col = MIN(dbcsr_nfullcols_total(matrix_right), & dbcsr_nfullcols_total(matrix_c)) END IF IF (f_row .GT. l_row .AND. l_row .GT. 0) & DBCSR_ABORT("Last row smaller than first row") IF (f_col .GT. l_col .AND. l_col .GT. 0) & DBCSR_ABORT("Last col smaller than first col") ! ! Product data needs to be retained when ! * beta != 0; or ! * there is column limiting (l_col > 0) and the limiting column ! is less than the number of full columns in the product matrix keep_sparsity = .FALSE. IF (PRESENT(retain_sparsity)) keep_sparsity = retain_sparsity ! keep_product_data = keep_sparsity & .OR. .NOT. dbcsr_scalar_are_equal(beta, dbcsr_scalar_zero(beta%data_type)) & .OR. (l_col .GT. 0 .AND. l_col .LT. dbcsr_nfullcols_total(matrix_c)) & .OR. (l_row .GT. 0 .AND. l_row .LT. dbcsr_nfullrows_total(matrix_c)) ! IF (.NOT. dbcsr_scalar_are_equal(beta, dbcsr_scalar_one(beta%data_type)) .AND. keep_product_data) THEN CALL dbcsr_scale(matrix_c, alpha_scalar=beta, & limits=(/f_row, l_row, f_col, l_col/)) END IF ! ! The index of the product matrix is twiddled into canonical form ! if it is (anti)symmetric (i.e., rows and columns are where the ! row/column distributions say they are). Doing this in advance ! makes the local multiply more efficient. IF (dbcsr_has_symmetry(matrix_c)) THEN product_reindex = .TRUE. ELSE product_reindex = .FALSE. END IF ! Product can not be made dense; however, A & B may still be made ! dense unless previously determined otherwise. IF (product_reindex .OR. keep_sparsity) THEN use_dense_mult = .FALSE. END IF ! ! The thread distribution must reflect the current (possibly ! dense) distribution thread_dist_force = .FALSE. IF (.NOT. dbcsr_distribution_has_threads(matrix_c%dist)) THEN release_tdist = .TRUE. CALL dbcsr_distribution_make_threads(matrix_c%dist) ELSE release_tdist = .FALSE. ! Make sure matrix_c thread dist == matrix_left thread dist ! This is currently a workaround IF (dbcsr_distribution_has_threads(matrix_left%dist)) THEN matrix_c_thread_dist = matrix_c%dist%d%thread_dist matrix_c%dist%d%thread_dist = matrix_left%dist%d%thread_dist CALL array_hold(matrix_left%dist%d%thread_dist) thread_dist_force = .TRUE. END IF END IF ! ! Compute number of images (rows and columns) nimages_left_rows = dbcsr_mp_nprows(dbcsr_distribution_mp(matrix_left%dist)) nimages_match = dbcsr_distribution_get_num_images_1d( & dbcsr_nfullcols_total(matrix_left), & dbcsr_nblkcols_total(matrix_left), & dbcsr_mp_nprows(dbcsr_distribution_mp(matrix_left%dist)), & dbcsr_mp_npcols(dbcsr_distribution_mp(matrix_left%dist))) nimages_right_cols = dbcsr_mp_npcols(dbcsr_distribution_mp(matrix_right%dist)) ! ! Create imaged distributions for the multiply. CALL dbcsr_create_image_dist(rdist_right, matrix_right%dist, & match_row_nbins=dbcsr_mp_npcols(dbcsr_distribution_mp(matrix_left%dist)), & match_col_nbins=npcols, & match_col_pdist=dbcsr_distribution_col_dist(matrix_c%dist), & nimages_rows=nimages_match, & nimages_cols=nimages_right_cols) ! CALL dbcsr_create_image_dist(rdist_left, matrix_left%dist, & match_row_pdist=dbcsr_distribution_row_dist(matrix_c%dist), & match_row_nbins=nprows, & match_col_pdist=dbcsr_distribution_row_dist(rdist_right%i%main), & match_col_idist=array_data(rdist_right%i%row_image), & match_col_nbins=dbcsr_mp_nprows(dbcsr_distribution_mp(matrix_right%dist)), & nimages_rows=nimages_left_rows, & nimages_cols=nimages_match) ! IF (ab_dense) THEN CALL dbcsr_make_dists_dense(dbcsr_distribution(matrix_c), & rdist_left, rdist_right, dense_product_distribution, & dense_rdist_left, dense_rdist_right,.NOT. use_dense_mult, & m_map, k_vmap, n_map, matrix_c%row_blk_size) CALL make_sizes_dense(matrix_c%row_blk_size, m_map, & dbcsr_distribution_nrows(dense_product_distribution), & dense_row_sizes) CALL make_sizes_dense(matrix_c%col_blk_size, n_map, & dbcsr_distribution_ncols(dense_product_distribution), & dense_col_sizes) CALL make_sizes_dense(matrix_right%row_blk_size, k_vmap, & dbcsr_distribution_nrows(dense_rdist_right%i%main), & dense_k_sizes) END IF ! IF (use_dense_mult .AND. .NOT. ab_dense) & DBCSR_ABORT("Wrong logic when making dense matrices.") IF (use_dense_mult) THEN old_product_row_blk_offsets = matrix_c%row_blk_offset old_product_col_blk_offsets = matrix_c%col_blk_offset old_product_row_blk_sizes = matrix_c%row_blk_size old_product_col_blk_sizes = matrix_c%col_blk_size CALL array_hold(old_product_row_blk_offsets) CALL array_hold(old_product_col_blk_offsets) CALL array_hold(old_product_row_blk_sizes) CALL array_hold(old_product_col_blk_sizes) old_product_distribution = dbcsr_distribution(matrix_c) CALL dbcsr_distribution_hold(old_product_distribution) product_matrix = dbcsr_type() CALL dbcsr_make_dense(matrix_c, product_matrix, & dense_product_distribution, & dense_row_sizes, dense_col_sizes, & m_map, n_map) ELSE product_matrix = dbcsr_type() CALL dbcsr_copy(product_matrix, matrix_c, shallow_data=.TRUE.) END IF IF (ab_dense) THEN CALL dbcsr_distribution_release(dense_product_distribution) END IF ! ! This is needed to build the hash tables because they are ! locally indexed. CALL dbcsr_reset_locals(product_matrix) ! IF (debug_mod) THEN WRITE (*, *) routineN//" Matrices ", dbcsr_get_matrix_type(matrix_a), & dbcsr_get_matrix_type(matrix_b), dbcsr_get_matrix_type(matrix_c) WRITE (*, *) routineN//" Matrices ", transa_l, transb_l, "keep", keep_product_data END IF IF (keep_product_data) THEN IF (product_reindex) THEN IF (debug_mod) WRITE (*, *) routineN//" Making canonical index" CALL dbcsr_make_index_canonical(product_matrix) END IF IF (ASSOCIATED(product_matrix%wms)) & DBCSR_ABORT("Product matrix should be finalized!") CALL dbcsr_make_untransposed_blocks(product_matrix) !$OMP PARALLEL & !$OMP DEFAULT (NONE) SHARED (product_matrix) ! For the multiply logic to work correctly, existing data must ! be added only after the index has been transformed into the ! canonical form. CALL dbcsr_add_wm_from_matrix(product_matrix) !$OMP END PARALLEL ELSE !$OMP PARALLEL DEFAULT(NONE) PRIVATE(ithread) & !$OMP SHARED(product_matrix, memtype_product_wm) ithread = 0 !$ ithread = OMP_GET_THREAD_NUM() CALL dbcsr_work_create(product_matrix, work_mutable=.FALSE., & memory_type=memtype_product_wm(ithread)%p) !$OMP END PARALLEL END IF ! IF (dbcsr_cfg%use_mpi_rma%val) THEN ! Check for previous multiplication completeness IF (request_sync_mult .NE. mp_request_null) THEN CALL timeset(routineN//"_sync_mult", handle2) CALL mp_wait(request_sync_mult) CALL timestop(handle2) request_sync_mult = mp_request_null END IF ! ! Left buffer images CALL dbcsr_make_buffers(matrix_left, rdist_left, .TRUE., & f_row, l_row, f_k, l_k, & PRESENT(filter_eps)) ! ! Right buffer images CALL dbcsr_make_buffers(matrix_right, rdist_right, .FALSE., & f_k, l_k, f_col, l_col, & PRESENT(filter_eps), & alpha) ELSE product_matrix%nblks = 0 product_matrix%nze = 0 product_matrix%row_p(:) = 0 CALL dbcsr_data_set_size_referenced(product_matrix%data_area, 0) product_matrix%valid = .FALSE. ! ! Right images CALL make_m2s(matrix_right, m2s_right, rdist_right, dense_rdist_right, & use_dense_mult, ab_dense, "R", & f_k, l_k, f_row, l_row, f_col, l_col, & dense_k_sizes, dense_col_sizes, & k_vmap, m_map, n_map, & alpha) ! ! Left images CALL make_m2s(matrix_left, m2s_left, rdist_left, dense_rdist_left, & use_dense_mult, ab_dense, "L", & f_k, l_k, f_row, l_row, f_col, l_col, & dense_row_sizes, dense_k_sizes, & k_vmap, m_map, n_map) END IF ! IF (ab_dense) THEN CALL array_release(k_vmap) CALL array_release(dense_row_sizes) CALL array_release(dense_col_sizes) CALL array_release(dense_k_sizes) END IF ! ! The limits were already used. Reset them. f_row = 0; l_row = 0 f_col = 0; l_col = 0 f_k = 0; l_k = 0 ! my_flop = 0 IF (dbcsr_cfg%use_mpi_rma%val) THEN CALL multiply_3D(rdist_left, rdist_right, & matrix_left, matrix_right, product_matrix, & retain_sparsity=retain_sparsity, & filter_eps=filter_eps, & flop=my_flop, keep_product_data=keep_product_data) ELSE data_type = dbcsr_get_data_type(product_matrix) #if defined (__DBCSR_ACC_G2G) IF (data_type .NE. dbcsr_type_real_8 .OR. (.NOT. dbcsr_cfg%use_acc_g2g%val)) THEN ! If G2G is enabled, norms have to be calculated on the GPU. ! Since the norms kernel expects only real_8 type data, we ! avoid using G2G for all other data types CALL multiply_cannon(m2s_left, m2s_right, product_matrix, & retain_sparsity=retain_sparsity, & filter_eps=filter_eps, & flop=my_flop, keep_product_data=keep_product_data) ELSE CALL multiply_cannon_g2g(m2s_left, m2s_right, product_matrix, & retain_sparsity=retain_sparsity, & filter_eps=filter_eps, & flop=my_flop, keep_product_data=keep_product_data) END IF #else CALL multiply_cannon(m2s_left, m2s_right, product_matrix, & retain_sparsity=retain_sparsity, & filter_eps=filter_eps, & flop=my_flop, keep_product_data=keep_product_data) #endif CALL dbcsr_finalize(product_matrix, reshuffle=PRESENT(filter_eps) .AND. .NOT. keep_sparsity) END IF ! ! RMA implementation algorithm has to synchronize at the end of each multiplication comm = dbcsr_mp_group(dbcsr_distribution_mp(dbcsr_distribution(matrix_c))) IF (PRESENT(flop)) THEN ! return the average number of flops per MPI rank. ! Variance (which is fairly large) could be computed as well. CALL timeset(routineN//"_mpsum_flop", handle2) numnodes = dbcsr_mp_numnodes(dbcsr_distribution_mp(dbcsr_distribution(matrix_c))) CALL mp_sum(my_flop, comm) IF (PRESENT(flop)) THEN flop = (my_flop + numnodes - 1)/numnodes END IF CALL timestop(handle2) ELSEIF (dbcsr_cfg%use_mpi_rma%val) THEN CALL mp_isync(comm, request_sync_mult) END IF ! IF (release_tdist) THEN CALL dbcsr_distribution_no_threads(product_matrix%dist) ELSEIF (thread_dist_force) THEN ! Restore matrix_c thread-dist matrix_c%dist%d%thread_dist = matrix_c_thread_dist CALL array_release(matrix_left%dist%d%thread_dist) END IF IF (transpose_left) CALL dbcsr_release(matrix_left) IF (transpose_right) CALL dbcsr_release(matrix_right) ! CALL dbcsr_release_locals(product_matrix) ! The index of the product matrix is reset to the CP2K form if it ! was previously set to the canonical form. IF (product_reindex) THEN IF (debug_mod) WRITE (*, *) routineN//" Making CP2K index" CALL dbcsr_make_index_canonical(product_matrix, cp2k=.TRUE.) END IF IF (use_dense_mult) THEN CALL dbcsr_release(matrix_c) matrix_c = dbcsr_type() CALL dbcsr_make_undense(product_matrix, matrix_c, & old_product_distribution, & old_product_row_blk_offsets, old_product_col_blk_offsets, & old_product_row_blk_sizes, old_product_col_blk_sizes, & m_map, n_map) CALL dbcsr_release(product_matrix) CALL array_release(old_product_row_blk_offsets) CALL array_release(old_product_col_blk_offsets) CALL array_release(old_product_row_blk_sizes) CALL array_release(old_product_col_blk_sizes) CALL dbcsr_distribution_release(old_product_distribution) ELSE CALL dbcsr_release(matrix_c) matrix_c = dbcsr_type() CALL dbcsr_copy(matrix_c, product_matrix, shallow_data=.TRUE.) CALL dbcsr_release(product_matrix) END IF ! IF (.NOT. dbcsr_cfg%use_mpi_rma%val) THEN CALL dbcsr_destroy_array(m2s_left) DEALLOCATE (m2s_left) CALL dbcsr_destroy_array(m2s_right) DEALLOCATE (m2s_right) END IF ! CALL dbcsr_image_dist_release(rdist_left) CALL dbcsr_image_dist_release(rdist_right) IF (ab_dense) THEN CALL array_release(m_map) CALL array_release(n_map) END IF ! ! To support the canonical multiply (all non-transposed blocks), ! blocks may have to be transposed according to the CP2K ! triangular index. CALL dbcsr_make_untransposed_blocks(matrix_c) ! IF (debug_mod .OR. careful_mod) THEN IF (debug_mod) & WRITE (*, *) routineN//" Use dense mult, symm", & use_dense_mult, ab_dense, dbcsr_has_symmetry(matrix_c) CALL dbcsr_verify_matrix(matrix_c) IF (debug_mod) THEN cs = dbcsr_checksum(matrix_c) WRITE (*, *) routineN//" Multiplication", & num_multiplications, " Product checksum", cs END IF END IF ! This tends to trigger only when all of these conditions are fulfilled: ! - transa=="T" ! - matrix_c contains already blocks and beta is not zero ! - GPU-acceleration is enabled ! - multiple OpenMP threads are used IF (INT(matrix_c%nblks, KIND=int_8) > & INT(SIZE(array_data(matrix_c%row_blk_size)), KIND=int_8)* & INT(SIZE(array_data(matrix_c%col_blk_size)), KIND=int_8)) & DBCSR_ABORT("Bug: Matrix contains too many blocks") output_unit = default_output_unit num_multiplications = num_multiplications + 1 CALL timestop(handle) END SUBROUTINE dbcsr_multiply_generic