gdma: set transfer ability

2021-06-23 14:10:07 +08:00
parent d31b1f79e6
commit d9819bc7ae
10 changed files with 364 additions and 100 deletions
--- a/components/driver/gdma.c
+++ b/components/driver/gdma.c
@@ -74,6 +74,8 @@ struct gdma_channel_t {
    intr_handle_t intr; // per-channel interrupt handle
    gdma_channel_direction_t direction; // channel direction
    int periph_id; // Peripheral instance ID, indicates which peripheral is connected to this GDMA channel
+    size_t sram_alignment;  // alignment for memory in SRAM
+    size_t psram_alignment; // alignment for memory in PSRAM
    esp_err_t (*del)(gdma_channel_t *channel); // channel deletion function, it's polymorphic, see `gdma_del_tx_channel` or `gdma_del_rx_channel`
 };

@@ -271,6 +273,67 @@ err:
    return ret;
 }

+esp_err_t gdma_set_transfer_ability(gdma_channel_handle_t dma_chan, const gdma_transfer_ability_t *ability)
+{
+    esp_err_t ret = ESP_OK;
+    gdma_pair_t *pair = NULL;
+    gdma_group_t *group = NULL;
+    bool en_burst = true;
+    ESP_GOTO_ON_FALSE(dma_chan && ability, ESP_ERR_INVALID_ARG, err, TAG, "invalid argument"); // both pointers are dereferenced below
+    pair = dma_chan->pair;
+    group = pair->group;
+    size_t sram_alignment = ability->sram_trans_align;
+    size_t psram_alignment = ability->psram_trans_align;
+    // alignment should be 2^n (0, i.e. "no alignment required", also passes this check)
+    ESP_GOTO_ON_FALSE((sram_alignment & (sram_alignment - 1)) == 0, ESP_ERR_INVALID_ARG, err, TAG, "invalid sram alignment: %zu", sram_alignment);
+
+#if SOC_GDMA_SUPPORT_PSRAM
+    int block_size_index = 0; // burst block size selector written to the channel below
+    switch (psram_alignment) {
+    case 64: // 64 Bytes alignment
+        block_size_index = GDMA_LL_EXT_MEM_BK_SIZE_64B;
+        break;
+    case 32: // 32 Bytes alignment
+        block_size_index = GDMA_LL_EXT_MEM_BK_SIZE_32B;
+        break;
+    case 16: // 16 Bytes alignment
+        block_size_index = GDMA_LL_EXT_MEM_BK_SIZE_16B;
+        break;
+    case 0: // no alignment is required
+        block_size_index = GDMA_LL_EXT_MEM_BK_SIZE_16B;
+        psram_alignment = SOC_GDMA_PSRAM_MIN_ALIGN; // fall back to minimal alignment
+        break;
+    default: // any other value is rejected
+        ESP_GOTO_ON_FALSE(false, ESP_ERR_INVALID_ARG, err, TAG, "invalid psram alignment: %zu", psram_alignment);
+        break;
+    }
+#endif // #if SOC_GDMA_SUPPORT_PSRAM
+
+    if (dma_chan->direction == GDMA_CHANNEL_DIRECTION_TX) {
+        // TX channel can always enable burst mode, no matter data alignment
+        gdma_ll_tx_enable_data_burst(group->hal.dev, pair->pair_id, true);
+        gdma_ll_tx_enable_descriptor_burst(group->hal.dev, pair->pair_id, true);
+#if SOC_GDMA_SUPPORT_PSRAM
+        gdma_ll_tx_set_block_size_psram(group->hal.dev, pair->pair_id, block_size_index);
+#endif // #if SOC_GDMA_SUPPORT_PSRAM
+    } else {
+        // RX channel burst mode depends on specific data alignment
+        en_burst = sram_alignment >= 4; // burst is only enabled when SRAM buffers are at least 4-byte aligned
+        gdma_ll_rx_enable_data_burst(group->hal.dev, pair->pair_id, en_burst);
+        gdma_ll_rx_enable_descriptor_burst(group->hal.dev, pair->pair_id, en_burst);
+#if SOC_GDMA_SUPPORT_PSRAM
+        gdma_ll_rx_set_block_size_psram(group->hal.dev, pair->pair_id, block_size_index);
+#endif // #if SOC_GDMA_SUPPORT_PSRAM
+    }
+
+    dma_chan->sram_alignment = sram_alignment; // remember the effective alignments on the channel object
+    dma_chan->psram_alignment = psram_alignment;
+    ESP_LOGD(TAG, "%s channel (%d,%d), (%zu:%zu) bytes aligned, burst %s", dma_chan->direction == GDMA_CHANNEL_DIRECTION_TX ? "tx" : "rx",
+             group->group_id, pair->pair_id, sram_alignment, psram_alignment, en_burst ? "enabled" : "disabled");
+err:
+    return ret;
+}
+
 esp_err_t gdma_apply_strategy(gdma_channel_handle_t dma_chan, const gdma_strategy_config_t *config)
 {
    esp_err_t ret = ESP_OK;
--- a/components/driver/include/esp_private/gdma.h
+++ b/components/driver/include/esp_private/gdma.h
@@ -59,10 +59,23 @@ typedef struct {
    gdma_channel_handle_t sibling_chan; /*!< DMA sibling channel handle (NULL means having sibling is not necessary) */
    gdma_channel_direction_t direction; /*!< DMA channel direction */
    struct {
-        int reserve_sibling: 1;   /*!< If set, DMA channel allocator would prefer to allocate new channel in a new pair, and reserve sibling channel for future use */
+        int reserve_sibling: 1; /*!< If set, DMA channel allocator would prefer to allocate new channel in a new pair, and reserve sibling channel for future use */
    } flags;
 } gdma_channel_alloc_config_t;

+/**
+ * @brief GDMA transfer ability
+ *
+ * @note The alignment set in this structure is **not** a guarantee that gdma driver will take care of the nonalignment cases.
+ *       Actually the GDMA driver has no knowledge about the DMA buffer (address and size) used by upper layer.
+ *       So it's the responsibility of the **upper layer** to take care of the buffer address and size.
+ *
+ */
+typedef struct {
+    size_t sram_trans_align;  /*!< DMA transfer alignment for memory in SRAM, in bytes. The driver enables/disables burst mode based on this value. 0 means no alignment is required */
+    size_t psram_trans_align; /*!< DMA transfer alignment for memory in PSRAM, in bytes. The driver sets proper burst block size based on the alignment value. 0 means no alignment is required */
+} gdma_transfer_ability_t;
+
 /**
 * @brief Type of GDMA event data
 *
@@ -80,6 +93,9 @@ typedef struct {
 * @param event_data GDMA event data
 * @param user_data User registered data from `gdma_register_tx_event_callbacks` or `gdma_register_rx_event_callbacks`
 *
+ * @return Whether a task switch is needed after the callback function returns,
+ *         this is usually because the callback wakes up some higher-priority task.
+ *
 */
 typedef bool (*gdma_event_callback_t)(gdma_channel_handle_t dma_chan, gdma_event_data_t *event_data, void *user_data);

@@ -172,6 +188,18 @@ esp_err_t gdma_connect(gdma_channel_handle_t dma_chan, gdma_trigger_t trig_perip
 */
 esp_err_t gdma_disconnect(gdma_channel_handle_t dma_chan);

+/**
+ * @brief Set DMA channel transfer ability
+ *
+ * @param[in] dma_chan GDMA channel handle, allocated by `gdma_new_channel`
+ * @param[in] ability Transfer ability, e.g. alignment
+ * @return
+ *      - ESP_OK: Set DMA channel transfer ability successfully
+ *      - ESP_ERR_INVALID_ARG: Set DMA channel transfer ability failed because of invalid argument
+ *      - ESP_FAIL: Set DMA channel transfer ability failed because of other error
+ */
+esp_err_t gdma_set_transfer_ability(gdma_channel_handle_t dma_chan, const gdma_transfer_ability_t *ability);
+
 /**
 * @brief Apply channel strategy for GDMA channel
 *
--- a/components/esp_hw_support/esp_async_memcpy.c
+++ b/components/esp_hw_support/esp_async_memcpy.c
@@ -11,6 +11,8 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
+
+#include <sys/param.h>
 #include "freertos/FreeRTOS.h"
 #include "freertos/semphr.h"
 #include "hal/dma_types.h"
@@ -22,6 +24,8 @@

 static const char *TAG = "async_memcpy";

+#define ALIGN_DOWN(val, align)  ((val) & ~((align) - 1)) // round val down to a multiple of align; the mask trick requires align to be a non-zero power of two
+
 /**
 * @brief Type of async mcp stream
 *        mcp stream inherits DMA descriptor, besides that, it has a callback function member
@@ -43,7 +47,8 @@ typedef struct async_memcpy_context_t {
    dma_descriptor_t *tx_desc; // pointer to the next free TX descriptor
    dma_descriptor_t *rx_desc; // pointer to the next free RX descriptor
    dma_descriptor_t *next_rx_desc_to_check; // pointer to the next RX descriptor to recycle
-    uint32_t max_stream_num;            // maximum number of streams
+    uint32_t max_stream_num;    // maximum number of streams
+    size_t max_dma_buffer_size; // maximum DMA buffer size
    async_memcpy_stream_t *out_streams;    // pointer to the first TX stream
    async_memcpy_stream_t *in_streams;     // pointer to the first RX stream
    async_memcpy_stream_t streams_pool[0]; // stream pool (TX + RX), the size is configured during driver installation
@@ -82,9 +87,14 @@ esp_err_t esp_async_memcpy_install(const async_memcpy_config_t *config, async_me
    mcp_hdl->rx_desc = &mcp_hdl->in_streams[0].desc;
    mcp_hdl->next_rx_desc_to_check = &mcp_hdl->in_streams[0].desc;
    mcp_hdl->spinlock = (portMUX_TYPE)portMUX_INITIALIZER_UNLOCKED;
+    mcp_hdl->mcp_impl.sram_trans_align = config->sram_trans_align;
+    mcp_hdl->mcp_impl.psram_trans_align = config->psram_trans_align;
+    size_t trans_align = MAX(config->sram_trans_align, config->psram_trans_align);
+    mcp_hdl->max_dma_buffer_size = trans_align ? ALIGN_DOWN(DMA_DESCRIPTOR_BUFFER_MAX_SIZE, trans_align) : DMA_DESCRIPTOR_BUFFER_MAX_SIZE;

    // initialize implementation layer
-    async_memcpy_impl_init(&mcp_hdl->mcp_impl);
+    ret = async_memcpy_impl_init(&mcp_hdl->mcp_impl);
+    ESP_GOTO_ON_ERROR(ret, err, TAG, "DMA M2M init failed");

    *asmcp = mcp_hdl;

@@ -121,14 +131,14 @@ static int async_memcpy_prepare_receive(async_memcpy_t asmcp, void *buffer, size
    dma_descriptor_t *start = desc;
    dma_descriptor_t *end = desc;

-    while (size > DMA_DESCRIPTOR_BUFFER_MAX_SIZE) {
+    while (size > asmcp->max_dma_buffer_size) {
        if (desc->dw0.owner != DMA_DESCRIPTOR_BUFFER_OWNER_DMA) {
            desc->dw0.suc_eof = 0;
-            desc->dw0.size = DMA_DESCRIPTOR_BUFFER_MAX_SIZE;
+            desc->dw0.size = asmcp->max_dma_buffer_size;
            desc->buffer = &buf[prepared_length];
            desc = desc->next; // move to next descriptor
-            prepared_length += DMA_DESCRIPTOR_BUFFER_MAX_SIZE;
-            size -= DMA_DESCRIPTOR_BUFFER_MAX_SIZE;
+            prepared_length += asmcp->max_dma_buffer_size;
+            size -= asmcp->max_dma_buffer_size;
        } else {
            // out of RX descriptors
            goto _exit;
@@ -162,15 +172,15 @@ static int async_memcpy_prepare_transmit(async_memcpy_t asmcp, void *buffer, siz
    dma_descriptor_t *start = desc;
    dma_descriptor_t *end = desc;

-    while (len > DMA_DESCRIPTOR_BUFFER_MAX_SIZE) {
+    while (len > asmcp->max_dma_buffer_size) {
        if (desc->dw0.owner != DMA_DESCRIPTOR_BUFFER_OWNER_DMA) {
            desc->dw0.suc_eof = 0; // not the end of the transaction
-            desc->dw0.size = DMA_DESCRIPTOR_BUFFER_MAX_SIZE;
-            desc->dw0.length = DMA_DESCRIPTOR_BUFFER_MAX_SIZE;
+            desc->dw0.size = asmcp->max_dma_buffer_size;
+            desc->dw0.length = asmcp->max_dma_buffer_size;
            desc->buffer = &buf[prepared_length];
            desc = desc->next; // move to next descriptor
-            prepared_length += DMA_DESCRIPTOR_BUFFER_MAX_SIZE;
-            len -= DMA_DESCRIPTOR_BUFFER_MAX_SIZE;
+            prepared_length += asmcp->max_dma_buffer_size;
+            len -= asmcp->max_dma_buffer_size;
        } else {
            // out of TX descriptors
            goto _exit;
@@ -222,14 +232,20 @@ esp_err_t esp_async_memcpy(async_memcpy_t asmcp, void *dst, void *src, size_t n,
    size_t rx_prepared_size = 0;
    size_t tx_prepared_size = 0;
    ESP_GOTO_ON_FALSE(asmcp, ESP_ERR_INVALID_ARG, err, TAG, "mcp handle can't be null");
-    ESP_GOTO_ON_FALSE(async_memcpy_impl_is_buffer_address_valid(&asmcp->mcp_impl, src, dst), ESP_ERR_INVALID_ARG, err, TAG, "buffer address not valid");
-    ESP_GOTO_ON_FALSE(n <= DMA_DESCRIPTOR_BUFFER_MAX_SIZE * asmcp->max_stream_num, ESP_ERR_INVALID_ARG, err, TAG, "buffer size too large");
+    ESP_GOTO_ON_FALSE(async_memcpy_impl_is_buffer_address_valid(&asmcp->mcp_impl, src, dst), ESP_ERR_INVALID_ARG, err, TAG, "buffer address not valid: %p -> %p", src, dst);
+    ESP_GOTO_ON_FALSE(n <= asmcp->max_dma_buffer_size * asmcp->max_stream_num, ESP_ERR_INVALID_ARG, err, TAG, "buffer size too large");
+    if (asmcp->mcp_impl.sram_trans_align) { // 0 means no alignment requirement, so the size check is skipped
+        ESP_GOTO_ON_FALSE(((n & (asmcp->mcp_impl.sram_trans_align - 1)) == 0), ESP_ERR_INVALID_ARG, err, TAG, "copy size should align to %zu bytes", asmcp->mcp_impl.sram_trans_align);
+    }
+    if (asmcp->mcp_impl.psram_trans_align) { // checked regardless of where src/dst actually live
+        ESP_GOTO_ON_FALSE(((n & (asmcp->mcp_impl.psram_trans_align - 1)) == 0), ESP_ERR_INVALID_ARG, err, TAG, "copy size should align to %zu bytes", asmcp->mcp_impl.psram_trans_align);
+    }

    // Prepare TX and RX descriptor
    portENTER_CRITICAL_SAFE(&asmcp->spinlock);
    rx_prepared_size = async_memcpy_prepare_receive(asmcp, dst, n, &rx_start_desc, &rx_end_desc);
    tx_prepared_size = async_memcpy_prepare_transmit(asmcp, src, n, &tx_start_desc, &tx_end_desc);
-    if ((rx_prepared_size == n) && (tx_prepared_size == n)) {
+    if (rx_start_desc && tx_start_desc && (rx_prepared_size == n) && (tx_prepared_size == n)) {
        // register user callback to the last descriptor
        async_memcpy_stream_t *mcp_stream = __containerof(rx_end_desc, async_memcpy_stream_t, desc);
        mcp_stream->cb = cb_isr;
--- a/components/esp_hw_support/include/esp_async_memcpy.h
+++ b/components/esp_hw_support/include/esp_async_memcpy.h
@@ -54,8 +54,10 @@ typedef bool (*async_memcpy_isr_cb_t)(async_memcpy_t mcp_hdl, async_memcpy_event
 *
 */
 typedef struct {
-    uint32_t backlog; /*!< Maximum number of streams that can be handled simultaneously */
-    uint32_t flags;   /*!< Extra flags to control async memcpy feature */
+    uint32_t backlog;          /*!< Maximum number of streams that can be handled simultaneously */
+    size_t sram_trans_align;   /*!< DMA transfer alignment (both in size and address) for SRAM memory */
+    size_t psram_trans_align;  /*!< DMA transfer alignment (both in size and address) for PSRAM memory */
+    uint32_t flags;            /*!< Extra flags to control async memcpy feature */
 } async_memcpy_config_t;

 /**
@@ -63,9 +65,11 @@ typedef struct {
 *
 */
 #define ASYNC_MEMCPY_DEFAULT_CONFIG() \
-    {                              \
-        .backlog = 8,              \
-        .flags = 0,                \
+    {                                 \
+        .backlog = 8,                 \
+        .sram_trans_align = 0,        \
+        .psram_trans_align = 0,       \
+        .flags = 0,                   \
    }

 /**
--- a/components/esp_hw_support/port/async_memcpy_impl_gdma.c
+++ b/components/esp_hw_support/port/async_memcpy_impl_gdma.c
@@ -61,9 +61,21 @@ esp_err_t async_memcpy_impl_init(async_memcpy_impl_t *impl)

    gdma_strategy_config_t strategy_config = {
        .auto_update_desc = true,
-        .owner_check = true
+        .owner_check = true,
    };

+    gdma_transfer_ability_t transfer_ability = {
+        .sram_trans_align = impl->sram_trans_align,
+        .psram_trans_align = impl->psram_trans_align,
+    };
+    ret = gdma_set_transfer_ability(impl->tx_channel, &transfer_ability);
+    if (ret != ESP_OK) {
+        goto err;
+    }
+    ret = gdma_set_transfer_ability(impl->rx_channel, &transfer_ability);
+    if (ret != ESP_OK) {
+        goto err;
+    }
    gdma_apply_strategy(impl->tx_channel, &strategy_config);
    gdma_apply_strategy(impl->rx_channel, &strategy_config);

@@ -108,5 +120,15 @@ esp_err_t async_memcpy_impl_restart(async_memcpy_impl_t *impl)

 bool async_memcpy_impl_is_buffer_address_valid(async_memcpy_impl_t *impl, void *src, void *dst)
 {
-    return true;
+    bool valid = true; // NOTE(review): only `dst` is examined below; `src` is never checked -- confirm this is intended
+    if (esp_ptr_external_ram(dst)) { // destination in external RAM: apply the PSRAM alignment
+        if (impl->psram_trans_align) { // a zero alignment skips the check
+            valid = valid && (((intptr_t)dst & (impl->psram_trans_align - 1)) == 0);
+        }
+    } else { // otherwise apply the SRAM alignment
+        if (impl->sram_trans_align) { // a zero alignment skips the check
+            valid = valid && (((intptr_t)dst & (impl->sram_trans_align - 1)) == 0);
+        }
+    }
+    return valid;
 }
--- a/components/esp_hw_support/port/include/esp_async_memcpy_impl.h
+++ b/components/esp_hw_support/port/include/esp_async_memcpy_impl.h
@@ -46,6 +46,8 @@ typedef struct {
    gdma_channel_handle_t rx_channel;
 #endif
    intptr_t rx_eof_addr;
+    size_t sram_trans_align;
+    size_t psram_trans_align;
    bool isr_need_yield;      // if current isr needs a yield for higher priority task
 } async_memcpy_impl_t;

--- a/components/esp_hw_support/test/test_async_memcpy.c
+++ b/components/esp_hw_support/test/test_async_memcpy.c
@@ -12,37 +12,75 @@
 #include "ccomp_timer.h"
 #include "esp_async_memcpy.h"
 #include "soc/soc_caps.h"
+#include "hal/dma_types.h"

 #if SOC_CP_DMA_SUPPORTED || SOC_GDMA_SUPPORTED

 #define ALIGN_UP(addr, align) (((addr) + (align)-1) & ~((align)-1))
+#define ALIGN_DOWN(size, align)  ((size) & ~((align) - 1))

-static void async_memcpy_setup_testbench(uint32_t seed, uint32_t *buffer_size, uint8_t **src_buf, uint8_t **dst_buf, uint8_t **from_addr, uint8_t **to_addr, uint32_t align)
+typedef struct {
+    uint32_t seed;        // in: passed to srand() before the source buffer is filled
+    uint32_t buffer_size; // in: bytes to allocate; out: usable bytes after alignment and offset are applied
+    uint8_t *src_buf;     // out: raw source allocation
+    uint8_t *dst_buf;     // out: raw (zero-initialized) destination allocation
+    uint8_t *from_addr;   // out: src_buf aligned up to `align`, then advanced by `offset`
+    uint8_t *to_addr;     // out: dst_buf aligned up to `align`, then advanced by `offset`
+    uint32_t align;       // in: alignment applied to both addresses and to the size
+    uint32_t offset;      // in: extra byte offset added after alignment
+    bool src_in_psram;    // in: allocate source with MALLOC_CAP_SPIRAM (only when CONFIG_SPIRAM && SOC_GDMA_SUPPORT_PSRAM)
+    bool dst_in_psram;    // in: allocate destination with MALLOC_CAP_SPIRAM (only when CONFIG_SPIRAM && SOC_GDMA_SUPPORT_PSRAM)
+} memcpy_testbench_context_t;
+
+static void async_memcpy_setup_testbench(memcpy_testbench_context_t *test_context)
 {
-    srand(seed);
+    srand(test_context->seed);
    printf("allocating memory buffer...\r\n");
-    // memory copy from/to PSRAM is not allowed
-    *src_buf = heap_caps_malloc(*buffer_size, MALLOC_CAP_8BIT | MALLOC_CAP_DMA | MALLOC_CAP_INTERNAL);
-    *dst_buf = heap_caps_calloc(1, *buffer_size, MALLOC_CAP_8BIT | MALLOC_CAP_DMA | MALLOC_CAP_INTERNAL);
-
-    TEST_ASSERT_NOT_NULL_MESSAGE(*src_buf, "allocate source buffer failed");
-    TEST_ASSERT_NOT_NULL_MESSAGE(*dst_buf, "allocate destination buffer failed");
-
-    *from_addr = (uint8_t *)ALIGN_UP((uint32_t)(*src_buf), 4);
-    *to_addr = (uint8_t *)ALIGN_UP((uint32_t)(*dst_buf), 4);
-    uint8_t gap = MAX(*from_addr - *src_buf, *to_addr - *dst_buf);
-    *buffer_size -= gap;
-
-    *from_addr += align;
-    *to_addr += align;
-    *buffer_size -= align;
-
-    printf("...size %d Bytes, src@%p, dst@%p\r\n", *buffer_size, *from_addr, *to_addr);
-
-    printf("fill src buffer with random data\r\n");
-    for (int i = 0; i < *buffer_size; i++) {
-        (*from_addr)[i] = rand() % 256;
+    uint32_t buffer_size = test_context->buffer_size;
+    uint8_t *src_buf = NULL;
+    uint8_t *dst_buf = NULL;
+    uint8_t *from_addr = NULL;
+    uint8_t *to_addr = NULL;
+#if CONFIG_SPIRAM && SOC_GDMA_SUPPORT_PSRAM
+    if (test_context->src_in_psram) {
+        src_buf = heap_caps_malloc(buffer_size, MALLOC_CAP_SPIRAM);
+    } else {
+        src_buf = heap_caps_malloc(buffer_size, MALLOC_CAP_8BIT | MALLOC_CAP_DMA | MALLOC_CAP_INTERNAL);
    }
+    if (test_context->dst_in_psram) {
+        dst_buf = heap_caps_calloc(1, buffer_size, MALLOC_CAP_SPIRAM);
+    } else {
+        dst_buf = heap_caps_calloc(1, buffer_size, MALLOC_CAP_8BIT | MALLOC_CAP_DMA | MALLOC_CAP_INTERNAL);
+    }
+#else
+    src_buf = heap_caps_malloc(buffer_size, MALLOC_CAP_8BIT | MALLOC_CAP_DMA | MALLOC_CAP_INTERNAL);
+    dst_buf = heap_caps_calloc(1, buffer_size, MALLOC_CAP_8BIT | MALLOC_CAP_DMA | MALLOC_CAP_INTERNAL);
+#endif
+    TEST_ASSERT_NOT_NULL_MESSAGE(src_buf, "allocate source buffer failed");
+    TEST_ASSERT_NOT_NULL_MESSAGE(dst_buf, "allocate destination buffer failed");
+    // address alignment
+    from_addr = (uint8_t *)ALIGN_UP((uint32_t)(src_buf), test_context->align);
+    to_addr = (uint8_t *)ALIGN_UP((uint32_t)(dst_buf), test_context->align);
+    uint8_t gap = MAX(from_addr - src_buf, to_addr - dst_buf);
+    buffer_size -= gap;
+    // size alignment
+    buffer_size = ALIGN_DOWN(buffer_size, test_context->align);
+    // adding extra offset
+    from_addr += test_context->offset;
+    to_addr += test_context->offset;
+    buffer_size -= test_context->offset;
+
+    printf("...size %d Bytes, src@%p, dst@%p\r\n", buffer_size, from_addr, to_addr);
+    printf("fill src buffer with random data\r\n");
+    for (int i = 0; i < buffer_size; i++) {
+        from_addr[i] = rand() % 256;
+    }
+    // return value
+    test_context->buffer_size = buffer_size;
+    test_context->src_buf = src_buf;
+    test_context->dst_buf = dst_buf;
+    test_context->from_addr = from_addr;
+    test_context->to_addr = to_addr;
 }

 static void async_memcpy_verify_and_clear_testbench(uint32_t seed, uint32_t buffer_size, uint8_t *src_buf, uint8_t *dst_buf, uint8_t *from_addr, uint8_t *to_addr)
@@ -91,18 +129,18 @@ TEST_CASE("memory copy by DMA one by one", "[async mcp]")
    TEST_ESP_OK(esp_async_memcpy_install(&config, &driver));

    uint32_t test_buffer_len[] = {256, 512, 1024, 2048, 4096, 5011};
-    uint8_t *sbuf = NULL;
-    uint8_t *dbuf = NULL;
-    uint8_t *from = NULL;
-    uint8_t *to = NULL;
+    memcpy_testbench_context_t test_context = {
+        .align = 4,
+    };

    for (int i = 0; i < sizeof(test_buffer_len) / sizeof(test_buffer_len[0]); i++) {
        // Test different align edge
-        for (int align = 0; align < 4; align++) {
-            async_memcpy_setup_testbench(i, &test_buffer_len[i], &sbuf, &dbuf, &from, &to, align);
-            TEST_ESP_OK(esp_async_memcpy(driver, to, from, test_buffer_len[i], NULL, NULL));
-            async_memcpy_verify_and_clear_testbench(i, test_buffer_len[i], sbuf, dbuf, from, to);
-
+        for (test_context.offset = 0; test_context.offset < 4; test_context.offset++) { // offsets 0..3 exercise every byte misalignment
+            test_context.buffer_size = test_buffer_len[i];
+            test_context.seed = i;
+            async_memcpy_setup_testbench(&test_context);
+            TEST_ESP_OK(esp_async_memcpy(driver, test_context.to_addr, test_context.from_addr, test_context.buffer_size, NULL, NULL));
+            async_memcpy_verify_and_clear_testbench(test_context.seed, test_context.buffer_size, test_context.src_buf, test_context.dst_buf, test_context.from_addr, test_context.to_addr);
            vTaskDelay(pdMS_TO_TICKS(100));
        }
    }
@@ -117,86 +155,177 @@ TEST_CASE("memory copy by DMA on the fly", "[async mcp]")
    TEST_ESP_OK(esp_async_memcpy_install(&config, &driver));

    uint32_t test_buffer_len[] = {512, 1024, 2048, 4096, 5011};
-    uint8_t *sbufs[] = {0, 0, 0, 0, 0};
-    uint8_t *dbufs[] = {0, 0, 0, 0, 0};
-    uint8_t *froms[] = {0, 0, 0, 0, 0};
-    uint8_t *tos[] = {0, 0, 0, 0, 0};
+    memcpy_testbench_context_t test_context[] = {
+        {.align = 4}, {.align = 4}, {.align = 4}, {.align = 4}, {.align = 4},
+    };

    // Aligned case
-    for (int i = 0; i < sizeof(sbufs) / sizeof(sbufs[0]); i++) {
-        async_memcpy_setup_testbench(i, &test_buffer_len[i], &sbufs[i], &dbufs[i], &froms[i], &tos[i], 0);
+    for (int i = 0; i < sizeof(test_buffer_len) / sizeof(test_buffer_len[0]); i++) {
+        test_context[i].seed = i;
+        test_context[i].buffer_size = test_buffer_len[i];
+        async_memcpy_setup_testbench(&test_context[i]);
    }
    for (int i = 0; i < sizeof(test_buffer_len) / sizeof(test_buffer_len[0]); i++) {
-        TEST_ESP_OK(esp_async_memcpy(driver, tos[i], froms[i], test_buffer_len[i], NULL, NULL));
+        TEST_ESP_OK(esp_async_memcpy(driver, test_context[i].to_addr, test_context[i].from_addr, test_context[i].buffer_size, NULL, NULL));
    }
-    for (int i = 0; i < sizeof(sbufs) / sizeof(sbufs[0]); i++) {
-        async_memcpy_verify_and_clear_testbench(i, test_buffer_len[i], sbufs[i], dbufs[i], froms[i], tos[i]);
+    for (int i = 0; i < sizeof(test_buffer_len) / sizeof(test_buffer_len[0]); i++) {
+        async_memcpy_verify_and_clear_testbench(i, test_context[i].buffer_size, test_context[i].src_buf, test_context[i].dst_buf, test_context[i].from_addr, test_context[i].to_addr);
    }

    // Non-aligned case
-    for (int i = 0; i < sizeof(sbufs) / sizeof(sbufs[0]); i++) {
-        async_memcpy_setup_testbench(i, &test_buffer_len[i], &sbufs[i], &dbufs[i], &froms[i], &tos[i], 3);
+    for (int i = 0; i < sizeof(test_buffer_len) / sizeof(test_buffer_len[0]); i++) {
+        test_context[i].seed = i;
+        test_context[i].buffer_size = test_buffer_len[i];
+        test_context[i].offset = 3;
+        async_memcpy_setup_testbench(&test_context[i]);
    }
    for (int i = 0; i < sizeof(test_buffer_len) / sizeof(test_buffer_len[0]); i++) {
-        TEST_ESP_OK(esp_async_memcpy(driver, tos[i], froms[i], test_buffer_len[i], NULL, NULL));
+        TEST_ESP_OK(esp_async_memcpy(driver, test_context[i].to_addr, test_context[i].from_addr, test_context[i].buffer_size, NULL, NULL));
    }
-    for (int i = 0; i < sizeof(sbufs) / sizeof(sbufs[0]); i++) {
-        async_memcpy_verify_and_clear_testbench(i, test_buffer_len[i], sbufs[i], dbufs[i], froms[i], tos[i]);
+    for (int i = 0; i < sizeof(test_buffer_len) / sizeof(test_buffer_len[0]); i++) {
+        async_memcpy_verify_and_clear_testbench(i, test_context[i].buffer_size, test_context[i].src_buf, test_context[i].dst_buf, test_context[i].from_addr, test_context[i].to_addr);
    }

    TEST_ESP_OK(esp_async_memcpy_uninstall(driver));
 }

-#define TEST_ASYNC_MEMCPY_BENCH_COUNTS (16)
-static uint32_t test_async_memcpy_bench_len = 4095;
-static int count = 0;
+#define TEST_ASYNC_MEMCPY_BENCH_COUNTS   (16)
+static int s_count = 0;

 static IRAM_ATTR bool test_async_memcpy_isr_cb(async_memcpy_t mcp_hdl, async_memcpy_event_t *event, void *cb_args)
 {
    SemaphoreHandle_t sem = (SemaphoreHandle_t)cb_args;
    BaseType_t high_task_wakeup = pdFALSE;
-    count++;
-    if (count == TEST_ASYNC_MEMCPY_BENCH_COUNTS) {
+    s_count++;
+    if (s_count == TEST_ASYNC_MEMCPY_BENCH_COUNTS) {
        xSemaphoreGiveFromISR(sem, &high_task_wakeup);
    }
    return high_task_wakeup == pdTRUE;
 }

-TEST_CASE("memory copy by DMA with callback", "[async mcp]")
+static void memcpy_performance_test(uint32_t buffer_size)
 {
    SemaphoreHandle_t sem = xSemaphoreCreateBinary();

    async_memcpy_config_t config = ASYNC_MEMCPY_DEFAULT_CONFIG();
-    config.backlog = TEST_ASYNC_MEMCPY_BENCH_COUNTS;
+    config.backlog = (buffer_size / DMA_DESCRIPTOR_BUFFER_MAX_SIZE + 1) * TEST_ASYNC_MEMCPY_BENCH_COUNTS;
+    config.sram_trans_align = 4;   // at least 4 bytes aligned for SRAM transfer
+    config.psram_trans_align = 64; // at least 64 bytes aligned for PSRAM transfer
    async_memcpy_t driver = NULL;
+    int64_t elapse_us = 0;
+    float throughput = 0.0;
    TEST_ESP_OK(esp_async_memcpy_install(&config, &driver));

-    uint8_t *sbuf = NULL;
-    uint8_t *dbuf = NULL;
-    uint8_t *from = NULL;
-    uint8_t *to = NULL;
-
-    async_memcpy_setup_testbench(0, &test_async_memcpy_bench_len, &sbuf, &dbuf, &from, &to, 0);
-    count = 0;
+    // 1. SRAM->SRAM
+    memcpy_testbench_context_t test_context = {
+        .align = config.psram_trans_align,
+        .buffer_size = buffer_size,
+        .src_in_psram = false,
+        .dst_in_psram = false,
+    };
+    async_memcpy_setup_testbench(&test_context);
+    s_count = 0;
    ccomp_timer_start();
    for (int i = 0; i < TEST_ASYNC_MEMCPY_BENCH_COUNTS; i++) {
-        TEST_ESP_OK(esp_async_memcpy(driver, to, from, test_async_memcpy_bench_len, test_async_memcpy_isr_cb, sem));
+        TEST_ESP_OK(esp_async_memcpy(driver, test_context.to_addr, test_context.from_addr, test_context.buffer_size, test_async_memcpy_isr_cb, sem));
    }
-
    // wait for done semaphore
    TEST_ASSERT_EQUAL(pdTRUE, xSemaphoreTake(sem, pdMS_TO_TICKS(1000)));
-    esp_rom_printf("memcpy %d Bytes data by HW costs %lldus\r\n", test_async_memcpy_bench_len, ccomp_timer_stop() / TEST_ASYNC_MEMCPY_BENCH_COUNTS);
-
+    elapse_us = ccomp_timer_stop();
+    throughput = (float)test_context.buffer_size * 1e6 * TEST_ASYNC_MEMCPY_BENCH_COUNTS / 1024 / 1024 / elapse_us;
+    IDF_LOG_PERFORMANCE("DMA_COPY", "%.2f MB/s, dir: SRAM->SRAM, size: %zu Bytes", throughput, test_context.buffer_size);
    ccomp_timer_start();
    for (int i = 0; i < TEST_ASYNC_MEMCPY_BENCH_COUNTS; i++) {
-        memcpy(to, from, test_async_memcpy_bench_len);
+        memcpy(test_context.to_addr, test_context.from_addr, test_context.buffer_size);
    }
-    esp_rom_printf("memcpy %d Bytes data by SW costs %lldus\r\n", test_async_memcpy_bench_len, ccomp_timer_stop() / TEST_ASYNC_MEMCPY_BENCH_COUNTS);
+    elapse_us = ccomp_timer_stop();
+    throughput = (float)test_context.buffer_size * 1e6 * TEST_ASYNC_MEMCPY_BENCH_COUNTS / 1024 / 1024 / elapse_us;
+    IDF_LOG_PERFORMANCE("CPU_COPY", "%.2f MB/s, dir: SRAM->SRAM, size: %zu Bytes", throughput, test_context.buffer_size);
+    async_memcpy_verify_and_clear_testbench(test_context.seed, test_context.buffer_size, test_context.src_buf, test_context.dst_buf, test_context.from_addr, test_context.to_addr);

-    async_memcpy_verify_and_clear_testbench(0, test_async_memcpy_bench_len, sbuf, dbuf, from, to);
+#if CONFIG_SPIRAM && SOC_GDMA_SUPPORT_PSRAM
+    // 2. PSRAM->PSRAM
+    test_context.src_in_psram = true;
+    test_context.dst_in_psram = true;
+    async_memcpy_setup_testbench(&test_context);
+    s_count = 0;
+    ccomp_timer_start();
+    for (int i = 0; i < TEST_ASYNC_MEMCPY_BENCH_COUNTS; i++) {
+        TEST_ESP_OK(esp_async_memcpy(driver, test_context.to_addr, test_context.from_addr, test_context.buffer_size, test_async_memcpy_isr_cb, sem));
+    }
+    // wait for done semaphore
+    TEST_ASSERT_EQUAL(pdTRUE, xSemaphoreTake(sem, pdMS_TO_TICKS(1000)));
+    elapse_us = ccomp_timer_stop();
+    throughput = (float)test_context.buffer_size * 1e6 * TEST_ASYNC_MEMCPY_BENCH_COUNTS / 1024 / 1024 / elapse_us;
+    IDF_LOG_PERFORMANCE("DMA_COPY", "%.2f MB/s, dir: PSRAM->PSRAM, size: %zu Bytes", throughput, test_context.buffer_size);
+    ccomp_timer_start();
+    for (int i = 0; i < TEST_ASYNC_MEMCPY_BENCH_COUNTS; i++) {
+        memcpy(test_context.to_addr, test_context.from_addr, test_context.buffer_size);
+    }
+    elapse_us = ccomp_timer_stop();
+    throughput = (float)test_context.buffer_size * 1e6 * TEST_ASYNC_MEMCPY_BENCH_COUNTS / 1024 / 1024 / elapse_us;
+    IDF_LOG_PERFORMANCE("CPU_COPY", "%.2f MB/s, dir: PSRAM->PSRAM, size: %zu Bytes", throughput, test_context.buffer_size);
+    async_memcpy_verify_and_clear_testbench(test_context.seed, test_context.buffer_size, test_context.src_buf, test_context.dst_buf, test_context.from_addr, test_context.to_addr);
+
+    // 3. PSRAM->SRAM
+    test_context.src_in_psram = true;
+    test_context.dst_in_psram = false;
+    async_memcpy_setup_testbench(&test_context);
+    s_count = 0;
+    ccomp_timer_start();
+    for (int i = 0; i < TEST_ASYNC_MEMCPY_BENCH_COUNTS; i++) {
+        TEST_ESP_OK(esp_async_memcpy(driver, test_context.to_addr, test_context.from_addr, test_context.buffer_size, test_async_memcpy_isr_cb, sem));
+    }
+    // wait for done semaphore
+    TEST_ASSERT_EQUAL(pdTRUE, xSemaphoreTake(sem, pdMS_TO_TICKS(1000)));
+    elapse_us = ccomp_timer_stop();
+    throughput = (float)test_context.buffer_size * 1e6 * TEST_ASYNC_MEMCPY_BENCH_COUNTS / 1024 / 1024 / elapse_us;
+    IDF_LOG_PERFORMANCE("DMA_COPY", "%.2f MB/s, dir: PSRAM->SRAM, size: %zu Bytes", throughput, test_context.buffer_size);
+    ccomp_timer_start();
+    for (int i = 0; i < TEST_ASYNC_MEMCPY_BENCH_COUNTS; i++) {
+        memcpy(test_context.to_addr, test_context.from_addr, test_context.buffer_size);
+    }
+    elapse_us = ccomp_timer_stop();
+    throughput = (float)test_context.buffer_size * 1e6 * TEST_ASYNC_MEMCPY_BENCH_COUNTS / 1024 / 1024 / elapse_us;
+    IDF_LOG_PERFORMANCE("CPU_COPY", "%.2f MB/s, dir: PSRAM->SRAM, size: %zu Bytes", throughput, test_context.buffer_size);
+    async_memcpy_verify_and_clear_testbench(test_context.seed, test_context.buffer_size, test_context.src_buf, test_context.dst_buf, test_context.from_addr, test_context.to_addr);
+
+    // 4. SRAM->PSRAM
+    test_context.src_in_psram = false;
+    test_context.dst_in_psram = true;
+    async_memcpy_setup_testbench(&test_context);
+    s_count = 0;
+    ccomp_timer_start();
+    for (int i = 0; i < TEST_ASYNC_MEMCPY_BENCH_COUNTS; i++) {
+        TEST_ESP_OK(esp_async_memcpy(driver, test_context.to_addr, test_context.from_addr, test_context.buffer_size, test_async_memcpy_isr_cb, sem));
+    }
+    // wait for done semaphore
+    TEST_ASSERT_EQUAL(pdTRUE, xSemaphoreTake(sem, pdMS_TO_TICKS(1000)));
+    elapse_us = ccomp_timer_stop();
+    throughput = (float)test_context.buffer_size * 1e6 * TEST_ASYNC_MEMCPY_BENCH_COUNTS / 1024 / 1024 / elapse_us;
+    IDF_LOG_PERFORMANCE("DMA_COPY", "%.2f MB/s, dir: SRAM->PSRAM, size: %zu Bytes", throughput, test_context.buffer_size);
+    ccomp_timer_start();
+    for (int i = 0; i < TEST_ASYNC_MEMCPY_BENCH_COUNTS; i++) {
+        memcpy(test_context.to_addr, test_context.from_addr, test_context.buffer_size);
+    }
+    elapse_us = ccomp_timer_stop();
+    throughput = (float)test_context.buffer_size * 1e6 * TEST_ASYNC_MEMCPY_BENCH_COUNTS / 1024 / 1024 / elapse_us;
+    IDF_LOG_PERFORMANCE("CPU_COPY", "%.2f MB/s, dir: SRAM->PSRAM, size: %zu Bytes", throughput, test_context.buffer_size);
+    async_memcpy_verify_and_clear_testbench(test_context.seed, test_context.buffer_size, test_context.src_buf, test_context.dst_buf, test_context.from_addr, test_context.to_addr);
+#endif

    TEST_ESP_OK(esp_async_memcpy_uninstall(driver));
    vSemaphoreDelete(sem);
 }

+TEST_CASE("memory copy performance test 40KB", "[async mcp]")
+{
+    memcpy_performance_test(40 * 1024);
+}
+
+TEST_CASE("memory copy performance test 4KB", "[async mcp]")
+{
+    memcpy_performance_test(4 * 1024);
+}
+
 #endif //SOC_CP_DMA_SUPPORTED || SOC_GDMA_SUPPORTED
--- a/components/mbedtls/port/crypto_shared_gdma/esp_crypto_shared_gdma.c
+++ b/components/mbedtls/port/crypto_shared_gdma/esp_crypto_shared_gdma.c
@@ -37,7 +37,7 @@ static inline esp_err_t crypto_shared_gdma_new_channel(gdma_channel_alloc_config
    esp_err_t ret;
    int time_waited_ms = 0;

-    while(1) {
+    while (1) {
        ret = gdma_new_channel(channel_config, channel);

        if (ret == ESP_OK) {
@@ -58,14 +58,12 @@ static inline esp_err_t crypto_shared_gdma_new_channel(gdma_channel_alloc_config
 /* Initialize external memory specific DMA configs */
 static void esp_crypto_shared_dma_init_extmem(void)
 {
-    int tx_ch_id = 0;
-    int rx_ch_id = 0;
-
-    gdma_get_channel_id(tx_channel, &tx_ch_id);
-    gdma_get_channel_id(rx_channel, &rx_ch_id);
-
-    gdma_ll_tx_set_block_size_psram(&GDMA, tx_ch_id, GDMA_LL_EXT_MEM_BK_SIZE_16B);
-    gdma_ll_rx_set_block_size_psram(&GDMA, rx_ch_id, GDMA_LL_EXT_MEM_BK_SIZE_16B);
+    gdma_transfer_ability_t transfer_ability = {
+        .sram_trans_align = 4,   // 4-byte SRAM alignment
+        .psram_trans_align = 16, // selects the same 16-byte PSRAM block size that was previously written directly
+    };
+    gdma_set_transfer_ability(tx_channel, &transfer_ability); // NOTE(review): return value is ignored here -- confirm a failure cannot occur or should be reported
+    gdma_set_transfer_ability(rx_channel, &transfer_ability); // NOTE(review): return value is ignored here as well
 }
 #endif //SOC_GDMA_SUPPORT_PSRAM

@@ -137,7 +135,7 @@ esp_err_t esp_crypto_shared_gdma_start(const lldesc_t *input, const lldesc_t *ou
        return ESP_ERR_INVALID_ARG;
    }

-  /* tx channel is reset by gdma_connect(), also reset rx to ensure a known state */
+    /* tx channel is reset by gdma_connect(), also reset rx to ensure a known state */
    gdma_get_channel_id(tx_channel, &rx_ch_id);
    gdma_ll_rx_reset_channel(&GDMA, rx_ch_id);