write out a stack for transposing the blocks
Type | Intent | Optional | Attributes | Name | ||
---|---|---|---|---|---|---|
type(dbcsr_type), | intent(in) | :: | matrix | |||
type(dbcsr_data_obj), | intent(inout) | :: | trs_stackbuf | |||
integer, | intent(in), | DIMENSION(:), POINTER, CONTIGUOUS | :: | row_blk_sizes | ||
integer, | intent(in), | DIMENSION(:), POINTER, CONTIGUOUS | :: | col_blk_sizes | ||
integer, | intent(in), | DIMENSION(:), POINTER, CONTIGUOUS | :: | row_blk_sizes2enum | ||
integer, | intent(in), | DIMENSION(:), POINTER, CONTIGUOUS | :: | enum2row_blk_sizes | ||
integer, | intent(in), | DIMENSION(:), POINTER, CONTIGUOUS | :: | col_blk_sizes2enum | ||
integer, | intent(in), | DIMENSION(:), POINTER, CONTIGUOUS | :: | enum2col_blk_sizes | ||
logical, | intent(in), | optional | :: | noresize |
SUBROUTINE acc_transpose_blocks(matrix, trs_stackbuf, &
!! write out a stack for transposing the blocks
row_blk_sizes, col_blk_sizes, &
row_blk_sizes2enum, enum2row_blk_sizes, &
col_blk_sizes2enum, enum2col_blk_sizes, &
noresize)
TYPE(dbcsr_type), INTENT(IN) :: matrix
TYPE(dbcsr_data_obj), INTENT(INOUT) :: trs_stackbuf
INTEGER, DIMENSION(:), INTENT(IN), POINTER, CONTIGUOUS :: row_blk_sizes, col_blk_sizes, &
row_blk_sizes2enum, enum2row_blk_sizes, &
col_blk_sizes2enum, enum2col_blk_sizes
LOGICAL, INTENT(IN), OPTIONAL :: noresize
CHARACTER(len=*), PARAMETER :: routineN = 'acc_transpose_blocks'
INTEGER :: blk_p, handle, handle1, i, m, mi, &
mi_max, n, nblks, ni, ni_max, offset, x, &
row, col
INTEGER, ALLOCATABLE, DIMENSION(:, :) :: counters, filled, offsets, tmp_stack
INTEGER, DIMENSION(:), POINTER :: blk_index
INTEGER, DIMENSION(:), POINTER, CONTIGUOUS :: trs_stack
LOGICAL :: my_noresize
CALL timeset(routineN, handle)
NULLIFY (trs_stack)
IF (.NOT. matrix%list_indexing) &
DBCSR_ABORT("build_trs_stack: only list_indexing supported.")
IF (.NOT. matrix%local_indexing) &
DBCSR_ABORT("build_trs_stack: only local_indexing supported.")
IF (trs_stackbuf%d%data_type /= dbcsr_type_int_4) &
DBCSR_ABORT("build_trs_stack: stac_buf has wrong datatype")
blk_index => matrix%coo_l
nblks = matrix%nblks
! make sure buffer from previous cannon-tick was uploaded
CALL timeset(routineN//"_sync", handle1)
CALL acc_event_synchronize(trs_stackbuf%d%acc_ready)
CALL timestop(handle1)
CALL timeset(routineN//"_ensure", handle1)
my_noresize = .FALSE.
IF (PRESENT(noresize)) my_noresize = noresize
IF (my_noresize) THEN
IF (dbcsr_data_get_size(trs_stackbuf) .LT. nblks) &
DBCSR_ABORT("build_trs_stack: trs_stackbuf undersized")
ELSE
CALL dbcsr_data_ensure_size(trs_stackbuf, data_size=nblks, nocopy=.TRUE.)
END IF
CALL dbcsr_data_set_size_referenced(trs_stackbuf, nblks)
trs_stack => trs_stackbuf%d%i4
CALL timestop(handle1)
mi_max = SIZE(enum2row_blk_sizes); ni_max = SIZE(enum2col_blk_sizes)
ALLOCATE (counters(mi_max, ni_max), offsets(mi_max, ni_max))
counters(:, :) = 0; offsets(:, :) = 0
CALL timeset(routineN//"_comp", handle1)
! Simplified algorithm for single size blocks
IF (mi_max .EQ. 1 .AND. ni_max .EQ. 1) THEN
DO i = 1, nblks
blk_p = blk_index(3*(i - 1) + 3)
IF (blk_p == 0) CYCLE
counters(1, 1) = counters(1, 1) + 1
trs_stack(counters(1, 1)) = blk_p - 1
END DO
ELSE
ALLOCATE (tmp_stack(3, nblks))
! collect block addresses and dimensions in a temporary stack
! while doing so, also count number of blocks per block-dimensions
DO i = 1, nblks
row = blk_index(3*(i - 1) + 1)
col = blk_index(3*(i - 1) + 2)
blk_p = blk_index(3*(i - 1) + 3)
IF (blk_p == 0) CYCLE
m = row_blk_sizes(row)
n = col_blk_sizes(col)
mi = row_blk_sizes2enum(m)
ni = col_blk_sizes2enum(n)
tmp_stack(1, i) = mi
tmp_stack(2, i) = ni
tmp_stack(3, i) = blk_p - 1
counters(mi, ni) = counters(mi, ni) + 1
END DO
! calculate offsets for first element of each sub-stack
offset = 0
DO mi = 1, mi_max
DO ni = 1, ni_max
offsets(mi, ni) = offset
offset = offset + counters(mi, ni)
END DO
END DO
! write all sub-stacks into the host-pinned buffer
ALLOCATE (filled(mi_max, ni_max))
filled(:, :) = 0
DO i = 1, nblks
mi = tmp_stack(1, i)
ni = tmp_stack(2, i)
blk_p = tmp_stack(3, i)
x = offsets(mi, ni) + filled(mi, ni) + 1
trs_stack(x) = blk_p
filled(mi, ni) = filled(mi, ni) + 1
END DO
!sanity check
DO ni = 1, ni_max
DO mi = 1, mi_max
IF (filled(mi, ni) /= counters(mi, ni)) &
DBCSR_ABORT("acc_transpose_blocks: bug")
END DO
END DO
END IF
CALL timestop(handle1)
CALL timeset(routineN//"_sync", handle1)
!transfer all stacks
CALL dbcsr_data_host2dev(trs_stackbuf)
! make sure block-buffer is uploaded before running the kernels
CALL acc_stream_wait_event(trs_stackbuf%d%memory_type%acc_stream, matrix%data_area%d%acc_ready)
CALL timestop(handle1)
CALL timeset(routineN//"_kernels", handle1)
! launch kernels
DO ni = 1, ni_max
DO mi = 1, mi_max
IF (counters(mi, ni) > 0) THEN
m = enum2row_blk_sizes(mi)
n = enum2col_blk_sizes(ni)
CALL dbcsr_acc_transpose( &
trs_stack=trs_stackbuf%d%acc_devmem, &
offset=offsets(mi, ni), &
nblks=counters(mi, ni), &
data_type=matrix%data_type, &
buffer=matrix%data_area%d%acc_devmem, &
m=m, n=n, &
stream=trs_stackbuf%d%memory_type%acc_stream)
END IF
END DO
END DO
CALL timestop(handle1)
CALL timeset(routineN//"_sync", handle1)
! make sure block-buffer are not used until transpose kernels finished
CALL acc_event_record(trs_stackbuf%d%acc_ready, trs_stackbuf%d%memory_type%acc_stream)
CALL acc_stream_wait_event(matrix%data_area%d%memory_type%acc_stream, trs_stackbuf%d%acc_ready)
CALL acc_event_record(matrix%data_area%d%acc_ready, matrix%data_area%d%memory_type%acc_stream)
CALL timestop(handle1)
CALL timestop(handle)
END SUBROUTINE acc_transpose_blocks