write out a stack for transposing the blocks
Type | Intent | Optional | Attributes | Name | ||
---|---|---|---|---|---|---|
type(dbcsr_type), | intent(in) | :: | matrix | |||
type(dbcsr_data_obj), | intent(inout) | :: | trs_stackbuf | |||
integer, | intent(in), | DIMENSION(:), POINTER, CONTIGUOUS | :: | row_blk_sizes | ||
integer, | intent(in), | DIMENSION(:), POINTER, CONTIGUOUS | :: | col_blk_sizes | ||
integer, | intent(in), | DIMENSION(:), POINTER, CONTIGUOUS | :: | row_blk_sizes2enum | ||
integer, | intent(in), | DIMENSION(:), POINTER, CONTIGUOUS | :: | enum2row_blk_sizes | ||
integer, | intent(in), | DIMENSION(:), POINTER, CONTIGUOUS | :: | col_blk_sizes2enum | ||
integer, | intent(in), | DIMENSION(:), POINTER, CONTIGUOUS | :: | enum2col_blk_sizes | ||
logical, | intent(in), | optional | :: | noresize |
SUBROUTINE acc_transpose_blocks(matrix, trs_stackbuf, & !! write out a stack for transposing the blocks row_blk_sizes, col_blk_sizes, & row_blk_sizes2enum, enum2row_blk_sizes, & col_blk_sizes2enum, enum2col_blk_sizes, & noresize) TYPE(dbcsr_type), INTENT(IN) :: matrix TYPE(dbcsr_data_obj), INTENT(INOUT) :: trs_stackbuf INTEGER, DIMENSION(:), INTENT(IN), POINTER, CONTIGUOUS :: row_blk_sizes, col_blk_sizes, & row_blk_sizes2enum, enum2row_blk_sizes, & col_blk_sizes2enum, enum2col_blk_sizes LOGICAL, INTENT(IN), OPTIONAL :: noresize CHARACTER(len=*), PARAMETER :: routineN = 'acc_transpose_blocks' INTEGER :: blk_p, handle, handle1, i, m, mi, & mi_max, n, nblks, ni, ni_max, offset, x, & row, col INTEGER, ALLOCATABLE, DIMENSION(:, :) :: counters, filled, offsets, tmp_stack INTEGER, DIMENSION(:), POINTER :: blk_index INTEGER, DIMENSION(:), POINTER, CONTIGUOUS :: trs_stack LOGICAL :: my_noresize CALL timeset(routineN, handle) NULLIFY (trs_stack) IF (.NOT. matrix%list_indexing) & DBCSR_ABORT("build_trs_stack: only list_indexing supported.") IF (.NOT. matrix%local_indexing) & DBCSR_ABORT("build_trs_stack: only local_indexing supported.") IF (trs_stackbuf%d%data_type /= dbcsr_type_int_4) & DBCSR_ABORT("build_trs_stack: stac_buf has wrong datatype") blk_index => matrix%coo_l nblks = matrix%nblks ! make sure buffer from previous cannon-tick was uploaded CALL timeset(routineN//"_sync", handle1) CALL acc_event_synchronize(trs_stackbuf%d%acc_ready) CALL timestop(handle1) CALL timeset(routineN//"_ensure", handle1) my_noresize = .FALSE. IF (PRESENT(noresize)) my_noresize = noresize IF (my_noresize) THEN IF (dbcsr_data_get_size(trs_stackbuf) .LT. nblks) & DBCSR_ABORT("build_trs_stack: trs_stackbuf undersized") ELSE CALL dbcsr_data_ensure_size(trs_stackbuf, data_size=nblks, nocopy=.TRUE.) END IF CALL dbcsr_data_set_size_referenced(trs_stackbuf, nblks) trs_stack => trs_stackbuf%d%i4 CALL timestop(handle1) mi_max = SIZE(enum2row_blk_sizes); ni_max = SIZE(enum2col_blk_sizes) ALLOCATE (counters(mi_max, ni_max), offsets(mi_max, ni_max)) counters(:, :) = 0; offsets(:, :) = 0 CALL timeset(routineN//"_comp", handle1) ! Simplified algorithm for single size blocks IF (mi_max .EQ. 1 .AND. ni_max .EQ. 1) THEN DO i = 1, nblks blk_p = blk_index(3*(i - 1) + 3) IF (blk_p == 0) CYCLE counters(1, 1) = counters(1, 1) + 1 trs_stack(counters(1, 1)) = blk_p - 1 END DO ELSE ALLOCATE (tmp_stack(3, nblks)) ! collect block addresses and dimensions in a temporary stack ! while doing so, also count number of blocks per block-dimensions DO i = 1, nblks row = blk_index(3*(i - 1) + 1) col = blk_index(3*(i - 1) + 2) blk_p = blk_index(3*(i - 1) + 3) IF (blk_p == 0) CYCLE m = row_blk_sizes(row) n = col_blk_sizes(col) mi = row_blk_sizes2enum(m) ni = col_blk_sizes2enum(n) tmp_stack(1, i) = mi tmp_stack(2, i) = ni tmp_stack(3, i) = blk_p - 1 counters(mi, ni) = counters(mi, ni) + 1 END DO ! calculate offsets for first element of each sub-stack offset = 0 DO mi = 1, mi_max DO ni = 1, ni_max offsets(mi, ni) = offset offset = offset + counters(mi, ni) END DO END DO ! write all sub-stacks into the host-pinned buffer ALLOCATE (filled(mi_max, ni_max)) filled(:, :) = 0 DO i = 1, nblks mi = tmp_stack(1, i) ni = tmp_stack(2, i) blk_p = tmp_stack(3, i) x = offsets(mi, ni) + filled(mi, ni) + 1 trs_stack(x) = blk_p filled(mi, ni) = filled(mi, ni) + 1 END DO !sanity check DO ni = 1, ni_max DO mi = 1, mi_max IF (filled(mi, ni) /= counters(mi, ni)) & DBCSR_ABORT("acc_transpose_blocks: bug") END DO END DO END IF CALL timestop(handle1) CALL timeset(routineN//"_sync", handle1) !transfer all stacks CALL dbcsr_data_host2dev(trs_stackbuf) ! make sure block-buffer is uploaded before running the kernels CALL acc_stream_wait_event(trs_stackbuf%d%memory_type%acc_stream, matrix%data_area%d%acc_ready) CALL timestop(handle1) CALL timeset(routineN//"_kernels", handle1) ! launch kernels DO ni = 1, ni_max DO mi = 1, mi_max IF (counters(mi, ni) > 0) THEN m = enum2row_blk_sizes(mi) n = enum2col_blk_sizes(ni) CALL dbcsr_acc_transpose( & trs_stack=trs_stackbuf%d%acc_devmem, & offset=offsets(mi, ni), & nblks=counters(mi, ni), & data_type=matrix%data_type, & buffer=matrix%data_area%d%acc_devmem, & m=m, n=n, & stream=trs_stackbuf%d%memory_type%acc_stream) END IF END DO END DO CALL timestop(handle1) CALL timeset(routineN//"_sync", handle1) ! make sure block-buffer are not used until transpose kernels finished CALL acc_event_record(trs_stackbuf%d%acc_ready, trs_stackbuf%d%memory_type%acc_stream) CALL acc_stream_wait_event(matrix%data_area%d%memory_type%acc_stream, trs_stackbuf%d%acc_ready) CALL acc_event_record(matrix%data_area%d%acc_ready, matrix%data_area%d%memory_type%acc_stream) CALL timestop(handle1) CALL timestop(handle) END SUBROUTINE acc_transpose_blocks