本文转载自:亦梦云烟微信公众号
1. 什么是DMA
DMA是直接内存访问(Direct Memory Access),DMA引擎可以将数据从一个地方传输到另一个地方,在传输过程中不经过CPU的控制。最简单的DMA用法是将数据从内存的一个区域搬运到另一个区域。DMA也可以将外设的数据(如ADC)搬运到内存中,或者将内存数据搬运到外设中(如DAC)。
Zynq-7000系列器件PS端的DMA控制器采用ARM的IP核DMA-330(PL-330)实现。
开发环境
1.1 结构特点
DMA控制器具有以下的特点:
1. 8个独立的通道,4个可用于PL—PS间数据管理,每个通道有1024Byte的MFIFO;
2. 使用CPU_2x 时钟搬运数据,CPU_2x = (CPU frq/6)*2;
3. 执行自定义内存区域内的DMA指令运行DMA;
4. AHB控制寄存器支持安全和非安全模式;
5. 每个通道内置4字Cache;
6. 可以访问SoC的以下映射物理地址:
DDR、OCM、PL、Linear QSPI Read、SMC和M_AXI_GP设备,访问设备的互联结构如图1所示。
图1 DMA PS结构示意图
1.2 Zynq 访问互联结构图
从图1可以看出DMA控制器可以访问连接到Central Interconnect上的所有设备,并提供了四个通道的外设管理接口可用于控制PL的数据搬运。
Zynq系列器件中DMA控制器采用ARM PL-330 IP核的r1p1版本,结构框图如图2所示。
图2 ZYNQ DMA控制器结构示意图
如图2所示,DMA控制器由指令加速引擎,AXI Master数据接口,AXI APB寄存器访问接口以及可以连接到PL的外设请求接口,数据缓冲FIFO和控制及状态产生单元组成。
从图2可以看到,DMA PL330的设计思想是:DMA控制器通过DMA指令执行引擎执行自己的指令,并将执行状态通过APB总线和中断等形式反馈给CPU,达到数据搬运不占用CPU的目的。
DMA控制器共有八个通道,其中四个通道负责互联到Central Interconnect的存储单元上的数据搬运;四个数据通道为外设请求接口,可用于PL AXI互联接口的数据访问管理。
每个DMA通道都执行自己的指令,拥有自己的独立线程,通道间互不影响。指令执行引擎有自己独立的Cache线。
2. 实例测试
首先构建AXI DMA例程使用的硬件环境,如图3所示,ZYNQ通过GP0端口读取Block RAM数据。
图3 Block RAM硬件结构
2.1 测试硬件完整性
首先使用SDK测试硬件的完整性,编写如下代码测试BRAM读写情况。
#include <stdio.h> #include "platform.h" #include "xil_printf.h" #include "xtime_l.h" #include "xparameters.h" void TC_BRAM(); #define RAM_W XPAR_AXI_BRAM_CTRL_0_S_AXI_BASEADDR #define RAM_R XPAR_AXI_BRAM_CTRL_1_S_AXI_BASEADDR int main() { init_platform(); TC_BRAM(); cleanup_platform(); return 0; } void TC_BRAM() { printf("test for block RAM\n"); XTime tb, te; double dt = 0.0; XTime_SetTime(0); for(int i=0; i<4*1024; i++) { *(int *)(RAM_W+4*i) = i; } XTime_GetTime(&tb); for(int i=0; i<4*1024; i++) { if(*(int *)(RAM_R+4*i) != i) { printf("Test Failed\n"); break; } } XTime_GetTime(&te); printf("Test pass\n"); dt = (te-tb)*1000000/COUNTS_PER_SECOND; printf("%fus\n",dt); printf("test for block RAM end!\n"); }
在串口终端中如果没有输出"Test Failed"则说明硬件设计无误。
2.2 测试内存读取速度
在使用DMA之前,首先在不使用DMA的情况下测试内存读取的速度。本例程首先写入0~4095,然后全部读取出来。
指针循环访问:
/*
 * Measures how long the CPU needs to read 4096 32-bit words from the block
 * RAM with a plain pointer loop, and prints the result in microseconds.
 *
 * The BRAM is first filled with 0..4095 through RAM_W; only the read-back loop
 * through RAM_R is timed.
 */
void TC_PointerSpeed()
{
    /*
     * static: 16 KB is too large to place on the stack safely.
     * NOTE(review): the standalone linker script usually reserves only a few
     * KB of stack - confirm _STACK_SIZE in lscript.ld.
     */
    static int a[4*1024];
    XTime tb, te;
    double dt;

    XTime_SetTime(0);
    for (int i = 0; i < 4*1024; i++) {
        *(volatile int *)(RAM_W + 4*i) = i;
    }

    XTime_GetTime(&tb);
    for (int i = 0; i < 4*1024; i++) {
        /* volatile load: the read under test must not be optimised away. */
        a[i] = *(volatile int *)(RAM_R + 4*i);
    }
    XTime_GetTime(&te);

    dt = (double)(te - tb) * 1000000.0 / COUNTS_PER_SECOND;
    printf("%fus\n", dt);
}
memcpy:
/*
 * Measures how long memcpy() needs to copy 4096 32-bit words (16 KB) from the
 * block RAM into a CPU-side buffer, and prints the result in microseconds.
 *
 * The BRAM is first filled with 0..4095 through RAM_W; only the memcpy() from
 * RAM_R is timed.
 */
void TC_MemcpySpeed()
{
    /*
     * static: 16 KB is too large to place on the stack safely.
     * NOTE(review): the standalone linker script usually reserves only a few
     * KB of stack - confirm _STACK_SIZE in lscript.ld.
     */
    static int a[4*1024];
    static volatile int sink;
    XTime tb, te;
    double dt;

    XTime_SetTime(0);
    for (int i = 0; i < 4*1024; i++) {
        *(volatile int *)(RAM_W + 4*i) = i;
    }

    XTime_GetTime(&tb);
    memcpy(a, (void*)RAM_R, sizeof a);
    XTime_GetTime(&te);

    /* Give the copied data an observable use, outside the timed region. */
    sink = a[4*1024 - 1];

    dt = (double)(te - tb) * 1000000.0 / COUNTS_PER_SECOND;
    printf("%fus\n", dt);
}
速度如下表所示。
可以看出使用CPU进行的内存复制效率非常低。
3. DMAPS应用
3.1 编程模型
本文不考虑外设请求接口,DMA控制器编程分为以下几个部分:
1. DMA控制器初始化;
2. 组织DMA引擎执行代码;
3. 启动或停止DMA传输;
4. 异常处理。
官方例程在Vivado安装路径下:
Vivado2018.2\SDK\2018.2\data\embeddedsw\XilinxProcessorIPLib\drivers\dmaps_v2_3\examples
#include <stdio.h>
#include <string.h>
#include "platform.h"
#include "xil_printf.h"
#include "sleep.h"
#include "xparameters.h"
#include "xil_types.h"
#include "xil_assert.h"
#include "xil_io.h"
#include "xil_exception.h"
#include "xil_cache.h"
#include "xil_printf.h"
#include "xscugic.h"
#include "xdmaps.h"
/************************** Constant Definitions *****************************/
/*
* The following constants map to the XPAR parameters created in the
* xparameters.h file. They are defined here such that a user can easily
* change all the needed parameters in one place.
*/
#define DMA_DEVICE_ID XPAR_XDMAPS_1_DEVICE_ID
#define INTC_DEVICE_ID XPAR_SCUGIC_SINGLE_DEVICE_ID
#define DMA_DONE_INTR_0 XPAR_XDMAPS_0_DONE_INTR_0
#define DMA_DONE_INTR_1 XPAR_XDMAPS_0_DONE_INTR_1
#define DMA_DONE_INTR_2 XPAR_XDMAPS_0_DONE_INTR_2
#define DMA_DONE_INTR_3 XPAR_XDMAPS_0_DONE_INTR_3
#define DMA_DONE_INTR_4 XPAR_XDMAPS_0_DONE_INTR_4
#define DMA_DONE_INTR_5 XPAR_XDMAPS_0_DONE_INTR_5
#define DMA_DONE_INTR_6 XPAR_XDMAPS_0_DONE_INTR_6
#define DMA_DONE_INTR_7 XPAR_XDMAPS_0_DONE_INTR_7
#define DMA_FAULT_INTR XPAR_XDMAPS_0_FAULT_INTR
#define TEST_ROUNDS 1 /* Number of loops that the Dma transfers run.*/
#define DMA_LENGTH 1024 /* Length of the Dma Transfers */
#define TIMEOUT_LIMIT 0x2000 /* Loop count for timeout */
/************************** Function Prototypes ******************************/
/* Runs the interrupt-driven DMA test on every channel of the given device. */
int XDmaPs_Example_W_Intr(XScuGic *GicPtr, u16 DeviceId);
/* Initialises the interrupt controller and connects/enables the DMA ISRs. */
int SetupInterruptSystem(XScuGic *GicPtr, XDmaPs *DmaPtr);
/* Done callback: compares source and destination and records the result. */
void DmaDoneHandler(unsigned int Channel, XDmaPs_Cmd *DmaCmd,
void *CallbackRef);
/************************** Variable Definitions *****************************/
/*
* Source and destination buffers of the DMA transfer, DMA_LENGTH ints each,
* aligned to 32 bytes. The IAR compiler (__ICCARM__) needs a pragma for the
* alignment; other compilers use the GCC-style attribute.
*/
#ifdef __ICCARM__
#pragma data_alignment=32
static int Src[DMA_LENGTH];
static int Dst[DMA_LENGTH];
#pragma data_alignment=4
#else
static int Src[DMA_LENGTH] __attribute__ ((aligned (32)));
static int Dst[DMA_LENGTH] __attribute__ ((aligned (32)));
#endif
/* Driver instance of the DMA controller under test. */
XDmaPs DmaInstance;
/* Interrupt controller instance; only defined when built as a standalone example. */
#ifndef TESTAPP_GEN
XScuGic GicInstance;
#endif
#ifndef TESTAPP_GEN
/*
* Entry point of the standalone example: runs the interrupt-driven DMA test
* once and reports the outcome on the console.
*
* Returns XST_SUCCESS when the test passed, XST_FAILURE otherwise.
*/
int main(void)
{
	if (XDmaPs_Example_W_Intr(&GicInstance, DMA_DEVICE_ID) == XST_SUCCESS) {
		xil_printf("Successfully ran XDMaPs_Example_W_Intr\r\n");
		return XST_SUCCESS;
	}

	xil_printf("Error: XDMaPs_Example_W_Intr failed\r\n");
	return XST_FAILURE;
}
#endif
/*****************************************************************************/
/**
*
* Interrupt Example to test the DMA.
*
* For every test round and every channel of the controller, the source buffer
* is filled with a descending pattern, the destination is cleared, and a
* memory-to-memory transfer of DMA_LENGTH ints is started. The function then
* busy-waits (bounded by TIMEOUT_LIMIT iterations) until DmaDoneHandler has
* stored a result for the channel.
*
* @param	GicPtr is the interrupt controller instance that routes the DMA
*		interrupts.
* @param	DeviceId is the Device ID of the DMA controller.
*
* @return	XST_SUCCESS to indicate success, otherwise XST_FAILURE
*		(initialisation error, start error, timeout, or a data
*		mismatch reported by the done handler).
*
* @note		None.
*
****************************************************************************/
int XDmaPs_Example_W_Intr(XScuGic *GicPtr, u16 DeviceId)
{
	int Index;
	unsigned int Channel = 0;
	int Status;
	int TestStatus;
	int TestRound;
	int TimeOutCnt;
	/*
	 * Both objects below are handed to interrupt context (the result array
	 * as callback reference, the command via XDmaPs_Start). They are static
	 * so that a completion interrupt arriving after a timeout - possibly
	 * after this function has returned - does not touch a dead stack frame.
	 */
	static volatile int Checked[XDMAPS_CHANNELS_PER_DEV];
	static XDmaPs_Cmd DmaCmd;
	XDmaPs_Config *DmaCfg;
	XDmaPs *DmaInst = &DmaInstance;

	/* Describe one incrementing memory-to-memory transfer from Src to Dst. */
	memset(&DmaCmd, 0, sizeof(XDmaPs_Cmd));
	DmaCmd.ChanCtrl.SrcBurstSize = 4;
	DmaCmd.ChanCtrl.SrcBurstLen = 4;
	DmaCmd.ChanCtrl.SrcInc = 1;
	DmaCmd.ChanCtrl.DstBurstSize = 4;
	DmaCmd.ChanCtrl.DstBurstLen = 4;
	DmaCmd.ChanCtrl.DstInc = 1;
	DmaCmd.BD.SrcAddr = (u32) Src;
	DmaCmd.BD.DstAddr = (u32) Dst;
	DmaCmd.BD.Length = DMA_LENGTH * sizeof(int);

	/*
	 * Initialize the DMA Driver
	 */
	DmaCfg = XDmaPs_LookupConfig(DeviceId);
	if (DmaCfg == NULL) {
		return XST_FAILURE;
	}

	Status = XDmaPs_CfgInitialize(DmaInst,
				      DmaCfg,
				      DmaCfg->BaseAddress);
	if (Status != XST_SUCCESS) {
		return XST_FAILURE;
	}

	/*
	 * Setup the interrupt system.
	 */
	Status = SetupInterruptSystem(GicPtr, DmaInst);
	if (Status != XST_SUCCESS) {
		return XST_FAILURE;
	}

	TestStatus = XST_SUCCESS;

	for (TestRound = 0; TestRound < TEST_ROUNDS; TestRound++) {
		xil_printf("Test round %d\r\n", TestRound);
		for (Channel = 0;
		     Channel < XDMAPS_CHANNELS_PER_DEV;
		     Channel++) {

			/* Initialize source */
			for (Index = 0; Index < DMA_LENGTH; Index++)
				Src[Index] = DMA_LENGTH - Index;

			/* Clear destination */
			for (Index = 0; Index < DMA_LENGTH; Index++)
				Dst[Index] = 0;

			/* 0 means "no result yet" for this channel */
			Checked[Channel] = 0;

			/* Set the Done interrupt handler */
			XDmaPs_SetDoneHandler(DmaInst,
					      Channel,
					      DmaDoneHandler,
					      (void *)Checked);

			Status = XDmaPs_Start(DmaInst, Channel, &DmaCmd, 0);
			if (Status != XST_SUCCESS) {
				return XST_FAILURE;
			}

			TimeOutCnt = 0;

			/*
			 * Busy-wait until the done handler stores a non-zero
			 * result for this channel or the loop budget runs out.
			 */
			while (!Checked[Channel]
			       && TimeOutCnt < TIMEOUT_LIMIT) {
				TimeOutCnt++;
			}

			if (TimeOutCnt >= TIMEOUT_LIMIT) {
				TestStatus = XST_FAILURE;
			}

			if (Checked[Channel] < 0) {
				/* DMA controller failed */
				TestStatus = XST_FAILURE;
			}
		}
	}

	return TestStatus;
}
/******************************************************************************/
/**
*
* Initialises the interrupt controller (unless built with TESTAPP_GEN), hooks
* it to the processor's IRQ exception, connects the DMA fault ISR and the
* eight per-channel done ISRs, enables those interrupts and finally enables
* exceptions on the processor.
*
* @param	GicPtr is the GIC instance pointer.
* @param	DmaPtr is the DMA instance pointer; it is passed to every ISR as
*		its callback argument.
*
* @return	XST_SUCCESS if everything was set up, otherwise XST_FAILURE.
*
* @note		None.
*
****************************************************************************/
int SetupInterruptSystem(XScuGic *GicPtr, XDmaPs *DmaPtr)
{
	/* Done interrupt of each channel paired with its driver ISR, in channel order. */
	static const struct {
		u32 IntrId;
		Xil_InterruptHandler Isr;
	} DoneIsrTable[] = {
		{ DMA_DONE_INTR_0, (Xil_InterruptHandler)XDmaPs_DoneISR_0 },
		{ DMA_DONE_INTR_1, (Xil_InterruptHandler)XDmaPs_DoneISR_1 },
		{ DMA_DONE_INTR_2, (Xil_InterruptHandler)XDmaPs_DoneISR_2 },
		{ DMA_DONE_INTR_3, (Xil_InterruptHandler)XDmaPs_DoneISR_3 },
		{ DMA_DONE_INTR_4, (Xil_InterruptHandler)XDmaPs_DoneISR_4 },
		{ DMA_DONE_INTR_5, (Xil_InterruptHandler)XDmaPs_DoneISR_5 },
		{ DMA_DONE_INTR_6, (Xil_InterruptHandler)XDmaPs_DoneISR_6 },
		{ DMA_DONE_INTR_7, (Xil_InterruptHandler)XDmaPs_DoneISR_7 },
	};
	const unsigned int DoneIsrCount =
		sizeof(DoneIsrTable) / sizeof(DoneIsrTable[0]);
	unsigned int Slot;
	int Status;
#ifndef TESTAPP_GEN
	XScuGic_Config *GicConfig;

	Xil_ExceptionInit();

	/* Bring up the interrupt controller driver. */
	GicConfig = XScuGic_LookupConfig(INTC_DEVICE_ID);
	if (GicConfig == NULL) {
		return XST_FAILURE;
	}

	Status = XScuGic_CfgInitialize(GicPtr, GicConfig,
				       GicConfig->CpuBaseAddress);
	if (Status != XST_SUCCESS) {
		return XST_FAILURE;
	}

	/* Route the processor's IRQ exception to the GIC dispatcher. */
	Xil_ExceptionRegisterHandler(XIL_EXCEPTION_ID_IRQ_INT,
				     (Xil_ExceptionHandler)XScuGic_InterruptHandler,
				     GicPtr);
#endif

	/* Fault ISR first; give up immediately if it cannot be connected. */
	Status = XScuGic_Connect(GicPtr,
				 DMA_FAULT_INTR,
				 (Xil_InterruptHandler)XDmaPs_FaultISR,
				 (void *)DmaPtr);
	if (Status != XST_SUCCESS) {
		return XST_FAILURE;
	}

	/*
	 * Connect all eight done ISRs, accumulating the results so that every
	 * connect is attempted before the combined status is checked.
	 */
	Status = XST_SUCCESS;
	for (Slot = 0; Slot < DoneIsrCount; Slot++) {
		Status |= XScuGic_Connect(GicPtr,
					  DoneIsrTable[Slot].IntrId,
					  DoneIsrTable[Slot].Isr,
					  (void *)DmaPtr);
	}
	if (Status != XST_SUCCESS) {
		return XST_FAILURE;
	}

	/* Enable the done interrupts in channel order, then the fault interrupt. */
	for (Slot = 0; Slot < DoneIsrCount; Slot++) {
		XScuGic_Enable(GicPtr, DoneIsrTable[Slot].IntrId);
	}
	XScuGic_Enable(GicPtr, DMA_FAULT_INTR);

	Xil_ExceptionEnable();

	return XST_SUCCESS;
}
/*****************************************************************************/
/**
*
* Done callback of a DMA transfer. Walks the source and destination buffers
* named in the command and checks that every destination word equals the
* source word and the expected pattern value (DMA_LENGTH - index). The verdict
* is stored in the result array passed as callback reference: 1 when all words
* match, -XST_FAILURE otherwise.
*
* @param	Channel is the Channel number; it selects the result slot.
* @param	DmaCmd is the Dma Command whose buffers are compared.
* @param	CallbackRef is the callback reference data, used as an array of
*		int results indexed by channel.
*
* @return	None.
*
* @note		None.
*
******************************************************************************/
void DmaDoneHandler(unsigned int Channel, XDmaPs_Cmd *DmaCmd, void *CallbackRef)
{
	volatile int *ResultSlots = (volatile int *)CallbackRef;
	int *SrcWord = (int *)DmaCmd->BD.SrcAddr;
	int *DstWord = (int *)DmaCmd->BD.DstAddr;
	int Outcome = 1;
	int Expected;

	/* Expected counts down from DMA_LENGTH, i.e. DMA_LENGTH - index. */
	for (Expected = DMA_LENGTH; Expected > 0; Expected--) {
		if (!((*SrcWord == *DstWord) && (*DstWord == Expected))) {
			Outcome = -XST_FAILURE;
		}
		SrcWord++;
		DstWord++;
	}

	ResultSlots[Channel] = Outcome;
}
3.2 修改DMA PS
修改DMA配置,使其将PL中的数据传输到内存中。
修改DMA的源地址:
DmaCmd.BD.SrcAddr = (u32) RAM_R;
测量DMA传输16KB数据,时间约为180us,传输速度远远高于memcpy。
4. Linux DMA驱动
4.1 编程方法
配置DMA
/*
 * dma_init() - prepare a memory-to-memory DMA transfer.
 * @s:    bus address of the transfer source, stored in dma_src
 * @size: size in bytes of the coherent destination buffer to allocate
 *
 * Allocates the destination buffer (dst / dma_dst), requests a DMA channel
 * with the DMA_MEMCPY capability and caches its device in dev. On failure an
 * error is logged, anything already acquired is released, and the function
 * returns with dst or chan left NULL.
 */
void dma_init(u32 s, int size)
{
	dma_cap_mask_t mask;

	/* The source is an existing bus address; only the destination is allocated. */
	dma_src = s;
	printk(KERN_INFO "dma_src = %pad\n", &dma_src);

	dst = dma_alloc_coherent(NULL, size, &dma_dst, GFP_KERNEL);
	if (!dst) {
		printk(KERN_ERR "Failed to allocate DMA destination buffer\n");
		return;
	}
	printk(KERN_INFO "dst = %p, dma_dst = %pad\n", dst, &dma_dst);

	dma_cap_zero(mask);
	dma_cap_set(DMA_MEMCPY, mask); /* direction: memory to memory */
	chan = dma_request_channel(mask, NULL, NULL); /* request a dma channel */
	if (!chan) {
		printk(KERN_ERR "Failed to request a DMA memcpy channel\n");
		dma_free_coherent(NULL, size, dst, dma_dst);
		dst = NULL;
		return;
	}
	printk(KERN_INFO "dma channel id = %d\n", chan->chan_id);

	flags = DMA_CTRL_ACK | DMA_PREP_INTERRUPT;
	dev = chan->device;
}
释放DMA
/*
 * dma_del() - release what dma_init() acquired.
 *
 * Frees the coherent destination buffer and releases the DMA channel. Each
 * resource is skipped when its pointer is NULL and is reset to NULL afterwards.
 */
void dma_del(void)
{
	if (dst) {
		/* The last argument is the bus address itself, not a pointer to it. */
		dma_free_coherent(NULL, MM_SIZE, dst, dma_dst);
		dst = NULL;
	}
	if (chan) {
		dma_release_channel(chan);
		chan = NULL;
	}
}
向DMA引擎发起一个传输请求
/*
 * dma_read() - queue one memcpy transfer on the DMA channel and start it.
 * @dst:  bus address of the destination
 * @src:  bus address of the source
 * @size: number of bytes to transfer
 *
 * Records the start time in tb, prepares a memcpy descriptor, installs
 * dma_callback_func as its completion callback, submits it and kicks the
 * engine. If preparing or submitting the descriptor fails, a message is
 * logged and the function returns without starting a transfer.
 */
void dma_read(u32 dst,u32 src,int size)
{
	/* take the start timestamp */
	do_gettimeofday(&tb);

	/* allocate a descriptor and set dst_addr, src_addr, data_size */
	tx = dev->device_prep_dma_memcpy(chan, dst, src, size, flags);
	if (!tx) {
		printk(KERN_INFO "Failed to prepare DMA memcpy");
		return; /* no descriptor: nothing to submit */
	}

	tx->callback = dma_callback_func; /* set call back function */
	tx->callback_param = NULL;

	cookie = tx->tx_submit(tx); /* submit the desc */
	if (dma_submit_error(cookie)) {
		printk(KERN_INFO "Failed to do DMA tx_submit");
		return; /* nothing was queued, so do not start the engine */
	}

	dma_async_issue_pending(chan); /* begin dma transfer */
}
4.2 实例代码 #include #define DEVICE_NAME "dma_driver" struct timeval tb, te; #define MM_SIZE (1440*10) void dma_callback_func(void *dma_async_param); struct dma_chan *chan; //When dma transfer finished,this function will be called. printk("memcpy\n"); printk("DMA transfer finished!\n\r"); void dma_read(u32 dst,u32 src,int size) tx->callback = dma_callback_func;//set call back function void dma_init(u32 s, int size) dma_cap_zero(mask); flags = DMA_CTRL_ACK | DMA_PREP_INTERRUPT; void dma_del(void) static int device_open(struct inode *inode, struct file *file) static int device_close(struct inode *indoe, struct file *file) static ssize_t device_read(struct file *filp, char __user *buf, size_t size, loff_t *ppos) dma_read(dma_dst, dma_src, MM_SIZE); return ret; static struct file_operations device_fops = static struct miscdevice MMAP_misc = static int __init char_device_init( void ) CaptureReadAddr0 = (volatile unsigned int*)ioremap(ImageReadAddress0, 1440*10); printk("init module\n"); src = (char*)CaptureReadAddr0; return 0; static void __exit char_device_exit( void ) iounmap(CaptureReadAddr0); MODULE_LICENSE("GPL"); module_init(char_device_init);//模块加载 使用DMA搬运和memcpy搬运PL中的数据速度对比如下: DMA搬运消耗了111us,而memcpy需要使用424us,可见DMA速度远高于CPU对数据的搬运。
先通过ioremap映射得到的地址src向Block RAM中写入一些字符,然后使用DMA从Block RAM中传输MM_SIZE(1440*10字节,约14KB)数据到分配的内存dst中。传输完成后调用dma_callback_func函数,在该函数中比较传输的数据和发送的数据是否相同,并测量DMA消耗的时间。
#include
#include
#define ImageReadAddress0 0x40000000
volatile unsigned int *CaptureReadAddr0;
void dma_read(u32 dma_dst,u32 dma_src,int size);
void dma_init(u32 s, int size);
void dma_del(void);
//bus address
dma_addr_t dma_src;
dma_addr_t dma_dst;
//virtual address
char *src = NULL;
char *dst = NULL ;
struct dma_device *dev;
struct dma_async_tx_descriptor *tx = NULL;
enum dma_ctrl_flags flags;
dma_cookie_t cookie;
/*
 * Print the tb / te timestamps and their difference as "seconds, microseconds".
 * The difference is normalised so the microsecond part is never negative.
 */
static void dma_report_elapsed(void)
{
	long sec = te.tv_sec - tb.tv_sec;
	long usec = te.tv_usec - tb.tv_usec;

	if (usec < 0) {
		sec--;
		usec += 1000000;
	}
	printk("T:%ld, %ld\n", tb.tv_sec, tb.tv_usec);
	printk("T2:%ld, %ld\n", te.tv_sec, te.tv_usec);
	printk(KERN_ALERT "time use:%ld, %ld\n", sec, usec);
}

/*
 * dma_callback_func() - called when the DMA transfer has finished.
 * @dma_async_param: callback parameter of the descriptor (unused)
 *
 * Reports the time the DMA transfer took (tb was taken when it was queued),
 * checks that dst holds the repeating 'a'..'z' pattern, then times a CPU
 * memcpy() of the same MM_SIZE bytes for comparison and prints PASS / Failed
 * for the DMA result.
 */
void dma_callback_func(void *dma_async_param)
{
	int i = 0;
	int dma_ok = 1;

	do_gettimeofday(&te);
	printk("DMA\n");
	dma_report_elapsed();

	/*
	 * Verify the DMA result now: the memcpy() below overwrites dst, so a
	 * later check would only validate the CPU copy.
	 */
	for (i = 0; i < MM_SIZE; i++) {
		if (*(dst + i) != (char)('a' + i%26)) {
			dma_ok = 0;
			break;
		}
	}

	/* NOTE(review): src is an ioremap()ed address; consider memcpy_fromio(). */
	do_gettimeofday(&tb);
	memcpy(dst ,src, MM_SIZE);
	do_gettimeofday(&te);
	printk("memcpy\n");
	dma_report_elapsed();

	printk("DMA transfer finished!\n\r");
	if (!dma_ok) {
		printk("Failed\n");
		return;
	}
	printk("PASS\n");
}
{
//alloc a desc,and set dst_addr,src_addr,data_size.
/*获取时间*/
do_gettimeofday(&tb);
tx = dev->device_prep_dma_memcpy(chan, dst, src, size, flags);
if (!tx){
printk(KERN_INFO "Failed to prepare DMA memcpy");
}
tx->callback_param = NULL;
cookie = tx->tx_submit(tx); //submit the desc
if (dma_submit_error(cookie)){
printk(KERN_INFO "Failed to do DMA tx_submit");
}
dma_async_issue_pending(chan);//begin dma transfer
}
{
dma_cap_mask_t mask;
//alloc 512B src memory and dst memory
dma_src = s;
printk(KERN_INFO "dma_src = 0x%x\n",src);
//src = dma_alloc_coherent(NULL, MM_SIZE, &dma_src, GFP_KERNEL);
dst = dma_alloc_coherent(NULL, size, &dma_dst, GFP_KERNEL);
printk(KERN_INFO "dst = 0x%x, dma_dst = 0x%x\n",dst, dma_dst);
dma_cap_set(DMA_SLAVE, mask);//direction:memory to memory
chan = dma_request_channel(mask,NULL,NULL); //request a dma channel
printk(KERN_INFO "dma channel id = %d\n",chan->chan_id);
dev = chan->device;
}
{
//free memory and dma channel
dma_free_coherent(NULL, MM_SIZE, dst, &dma_dst);
dma_release_channel(chan);
}
/* open() handler of the misc device: nothing to set up, always succeeds. */
static int device_open(struct inode *inode, struct file *file)
{
	return 0;
}
/* release() handler of the misc device: logs the close and always succeeds. */
static int device_close(struct inode *inode, struct file *file)
{
	printk("device close\n");
	return 0;
}
{
int ret = 0;
}
{
.owner = THIS_MODULE,
.open = device_open,
.release = device_close,
.read = device_read,
};
{
.minor = MISC_DYNAMIC_MINOR,
.name = DEVICE_NAME,
.fops = &device_fops,
};
{
int ret=0;
int i = 0;
printk("init module\n");
ret = misc_register(&MMAP_misc);
if(ret)
{
printk("Error:misc_register failed!\n");
return 0;
}
dma_init(ImageReadAddress0, MM_SIZE);
for (i = 0; i < MM_SIZE; i++){
*(src + i) = (char)('a' + i%26);
}
}
{
printk(KERN_ALERT"module exit\n");
misc_deregister(&MMAP_misc);
dma_del();
}
MODULE_AUTHOR("DMA_test");
MODULE_LICENSE("GPL");
module_init(char_device_init); /* module load */
module_exit(char_device_exit); /* module unload */
Z-turn# ./test
Test for dma
DMA
T:34, 358179
T2:34, 358290
time use:0, 111
memcpy
T:34, 364372
T2:34, 364796
time use:0, 424
DMA transfer finished!
PASS