Vitis + ZCU104案例教程

guanxiao_505740 在周四, 07/22/2021 - 11:13 提交

作者：弱咩咩

版权声明：本文为博主原创文章，遵循 CC 4.0 BY-SA 版权协议，转载请附上原文出处链接和本声明。
本文链接：https://blog.csdn.net/qq_39229006/article/details/108821786

参考：https://www.xilinx.com/html_docs/xilinx2020_1/vitis_doc/dvy1591145410207...

开发环境：
操作系统：Ubuntu18.04

开发平台：Xilinx ZCU104

开发工具：Vitis 2020.1

镜像烧写工具：balenaEtcher

串口调试工具：minicom

1.准备工作
1.1安装OpenCL客户端驱动加载程序：
sudo apt-get install ocl-icd-libopencl1 opencl-headers ocl-icd-opencl-dev

1.2安装需要的软件包：
sudo add-apt-repository ppa:xorg-edgers/ppa
sudo apt-get update
sudo apt-get install libgl1-mesa-glx
sudo apt-get install libgl1-mesa-dri
sudo apt-get install libgl1-mesa-dev
sudo add-apt-repository --remove ppa:xorg-edgers/ppa
sudo apt install net-tools
sudo apt-get install -y unzip
sudo apt install gcc
sudo apt install g++
sudo apt install python
ln -s /usr/bin/python2 /usr/bin/python

1.3安装Vitis软件平台：
https://www.xilinx.com/support/download/index.html/content/xilinx/en/dow...

1.4安装Xilinx运行时：
https://china.xilinx.com/support/download/index.html/content/xilinx/zh/d...

1.5安装目标平台文件(ZCU104 Base 2020.1 )：
https://china.xilinx.com/support/download/index.html/content/xilinx/zh/d...

将文件解压并放到/opt/xilinx/platforms/目录下

1.6设置Vitis和XRT环境变量：
gedit ~/.bashrc

在文件末尾加上：
#setup XILINX_VITIS and XILINX_VIVADO variables
source /Vitis/2020.1/settings64.sh
#setup XILINX_XRT
source /opt/xilinx/xrt/setup.sh

1.7下载嵌入式平台常用镜像：
https://china.xilinx.com/support/download/index.html/content/xilinx/zh/d...

下载文件：ZYNQMP通用映像（其他平台下载相应的镜像）

将文件解压到/opt/xilinx/目录下，解压后的目录名为xilinx-zynqmp-common-v2020.1，进入该目录，解压其中的rootfs.ext4.gz文件：
sudo gunzip ./rootfs.ext4.gz

解压后的文件名为：rootfs.ext4，接着为了得到平台所需要的sysroot文件，在当前文件目录下运行：
./sdk.sh -y -dir ./ -p

执行完成后得到文件夹ir（安装脚本应是把参数 -dir 解析成了 -d ir，因此sysroot被安装到当前目录下名为ir的文件夹中，下文的Sysroot path也以此路径为准）。

至此，准备工作已经基本完成。

2.执行案例
命令行运行vitis，打开Vitis 2020.1，设置好Vitis工作空间后，点击File->new->Application Project…，在弹出界面点击Next，在Platform界面选择xilinx_zcu104_base_202010_1，点击Next，

输入项目名称：例如：test，点击Next，

在Application settings中设置Sysroot path、Root FS、Kernel Image：

Sysroot path：/opt/xilinx/xilinx-zynqmp-common-v2020.1/ir/sysroots/aarch64-xilinx-linux

Root FS：/opt/xilinx/xilinx-zynqmp-common-v2020.1/rootfs.ext4

Kernel Image：/opt/xilinx/xilinx-zynqmp-common-v2020.1/Image

点击Next，

选择Empty Application，点击Finish。

在test_system->test->src右键new->File，创建内核函数，输入文件名，如:kernel_test.cpp，需要注意内核函数名不能与关键词同名

kernel_test.cpp:

#define BUFFER_SIZE 256
extern "C" {

/*
 * kernel_test: element-wise vector addition, c[k] = a[k] + b[k] for every
 * k in [0, n_elements).
 *
 * The vectors are processed in chunks of at most BUFFER_SIZE elements. For
 * each chunk, the values of a and b are first copied into two local arrays,
 * and then the sums are written out to c.
 *
 *   a, b        input vectors (m_axi bundle "gmem")
 *   c           output vector (m_axi bundle "gmem1")
 *   n_elements  number of elements to add; the loops do not execute when
 *               it is zero or negative
 */
void kernel_test(
    int* a,
    int* b,
    int* c,
    const int n_elements)
{

#pragma HLS INTERFACE m_axi offset=SLAVE bundle=gmem port=a max_read_burst_length = 256
#pragma HLS INTERFACE m_axi offset=SLAVE bundle=gmem port=b max_read_burst_length = 256
#pragma HLS INTERFACE m_axi offset=SLAVE bundle=gmem1 port=c max_write_burst_length = 256

#pragma HLS INTERFACE s_axilite port=a bundle=control
#pragma HLS INTERFACE s_axilite port=b bundle=control
#pragma HLS INTERFACE s_axilite port=c bundle=control

#pragma HLS INTERFACE s_axilite port=n_elements bundle=control
#pragma HLS INTERFACE s_axilite port=return bundle=control

    // Local staging storage for one chunk of each input vector.
    int localA[BUFFER_SIZE];
    int localB[BUFFER_SIZE];

    for (int base = 0; base < n_elements; base += BUFFER_SIZE)
    {
        // The last chunk may contain fewer than BUFFER_SIZE elements.
        const int remaining = n_elements - base;
        const int chunk = (remaining < BUFFER_SIZE) ? remaining : BUFFER_SIZE;

        // Copy the current chunk of a and b into the local arrays.
        load_inputs: for (int k = 0; k < chunk; k++) {
#pragma HLS pipeline ii = 1 rewind
            localA[k] = a[base + k];
            localB[k] = b[base + k];
        }

        // Add the staged values and write the sums to c.
        add_store: for (int k = 0; k < chunk; k++) {
#pragma HLS pipeline ii = 1 rewind
            c[base + k] = localA[k] + localB[k];
        }
    }

}
}

该内核函数简单实现了一个c=a+b的功能，运用了HLS的一些语法和优化。

然后再用一样的方法创建主机程序：host.cpp与host.h

host.cpp:
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>
#include "host.h"

static const int DATA_SIZE = 4096;

static const std::string error_message =
"Error: Result mismatch:\n"
"i = %d CPU result = %d Device result = %d\n";

int main(int argc, char* argv[]) {

//TARGET_DEVICE macro needs to be passed from gcc command line
if(argc != 2) {
std::cout << "Usage: " << argv[0] <<" " << std::endl;
return EXIT_FAILURE;
}

char* xclbinFilename = argv[1];

// Compute the size of array in bytes
size_t size_in_bytes = DATA_SIZE * sizeof(int);

// Creates a vector of DATA_SIZE elements with an initial value of 10 and 32
// using customized allocator for getting buffer alignment to 4k boundary

std::vector<:device> devices;
cl::Device device;
std::vector<:platform> platforms;
bool found_device = false;

//traversing all Platforms To find Xilinx Platform and targeted
//Device in Xilinx Platform
cl::Platform::get(&platforms);
for(size_t i = 0; (i < platforms.size() ) & (found_device == false) ;i++){
cl::Platform platform = platforms[i];
std::string platformName = platform.getInfo();
if ( platformName == "Xilinx"){
devices.clear();
platform.getDevices(CL_DEVICE_TYPE_ACCELERATOR, &devices);
if (devices.size()){
device = devices[0];
found_device = true;
break;
}
}
}
if (found_device == false){
std::cout << "Error: Unable to find Target Device "
<< device.getInfo() << std::endl;
return EXIT_FAILURE;
}

// Creating Context and Command Queue for selected device
cl::Context context(device);
cl::CommandQueue q(context, device, CL_QUEUE_PROFILING_ENABLE);

// Load xclbin
std::cout << "Loading: '" << xclbinFilename << "'\n";
std::ifstream bin_file(xclbinFilename, std::ifstream::binary);
bin_file.seekg (0, bin_file.end);
unsigned nb = bin_file.tellg();
bin_file.seekg (0, bin_file.beg);
char *buf = new char [nb];
bin_file.read(buf, nb);

// Creating Program from Binary File
cl::Program::Binaries bins;
bins.push_back({buf,nb});
devices.resize(1);
cl::Program program(context, devices, bins);

// This call will get the kernel object from program. A kernel is an
// OpenCL function that is executed on the FPGA.
cl::Kernel krnl_vector_add(program,"kernel_test");

// These commands will allocate memory on the Device. The cl::Buffer objects can
// be used to reference the memory locations on the device.
cl::Buffer buffer_a(context, CL_MEM_READ_ONLY, size_in_bytes);
cl::Buffer buffer_b(context, CL_MEM_READ_ONLY, size_in_bytes);
cl::Buffer buffer_result(context, CL_MEM_WRITE_ONLY, size_in_bytes);

//set the kernel Arguments
int narg=0;
krnl_vector_add.setArg(narg++,buffer_a);
krnl_vector_add.setArg(narg++,buffer_b);
krnl_vector_add.setArg(narg++,buffer_result);
krnl_vector_add.setArg(narg++,DATA_SIZE);

//We then need to map our OpenCL buffers to get the pointers
int *ptr_a = (int *) q.enqueueMapBuffer (buffer_a , CL_TRUE , CL_MAP_WRITE , 0, size_in_bytes);
int *ptr_b = (int *) q.enqueueMapBuffer (buffer_b , CL_TRUE , CL_MAP_WRITE , 0, size_in_bytes);
int *ptr_result = (int *) q.enqueueMapBuffer (buffer_result , CL_TRUE , CL_MAP_READ , 0, size_in_bytes);

//setting input data
for(int i = 0 ; i< DATA_SIZE; i++){
ptr_a[i] = 10;
ptr_b[i] = 20;
}

// Data will be migrated to kernel space
q.enqueueMigrateMemObjects({buffer_a,buffer_b},0/* 0 means from host*/);

//Launch the Kernel
q.enqueueTask(krnl_vector_add);

// The result of the previous kernel execution will need to be retrieved in
// order to view the results. This call will transfer the data from FPGA to
// source_results vector
q.enqueueMigrateMemObjects({buffer_result},CL_MIGRATE_MEM_OBJECT_HOST);

q.finish();

//Verify the result
int match = 0;
for (int i = 0; i < DATA_SIZE; i++) {
int host_result = ptr_a[i] + ptr_b[i];
if (ptr_result[i] != host_result) {
printf(error_message.c_str(), i, host_result, ptr_result[i]);
match = 1;
break;
}
}

q.enqueueUnmapMemObject(buffer_a , ptr_a);
q.enqueueUnmapMemObject(buffer_b , ptr_b);
q.enqueueUnmapMemObject(buffer_result , ptr_result);
q.finish();

std::cout << "TEST " << (match ? "FAILED" : "PASSED") << std::endl;
return (match ? EXIT_FAILURE : EXIT_SUCCESS);

}

host.h:
#pragma once

#define CL_HPP_CL_1_2_DEFAULT_BUILD
#define CL_HPP_TARGET_OPENCL_VERSION 120
#define CL_HPP_MINIMUM_OPENCL_VERSION 120
#define CL_HPP_ENABLE_PROGRAM_CONSTRUCTION_FROM_ARRAY_COMPATIBILITY 1

#include <CL/cl2.hpp>

//Customized buffer allocation for 4K boundary alignment
/**
 * Allocator whose storage is always aligned to a 4096-byte boundary.
 *
 * Usable as the allocator argument of standard containers, e.g.
 * std::vector<int, aligned_allocator<int>>. The allocator is stateless, so
 * all instances compare equal and memory obtained from one instance may be
 * released through any other.
 */
template <typename T>
struct aligned_allocator
{
    using value_type = T;

    aligned_allocator() noexcept = default;

    // Converting constructor: lets containers rebind the allocator to
    // another element type.
    template <typename U>
    aligned_allocator(const aligned_allocator<U>&) noexcept {}

    /**
     * Allocates uninitialized storage for num objects of type T.
     *
     * @param num  number of objects
     * @return pointer to storage aligned to 4096 bytes
     * @throws std::bad_alloc if the byte count would overflow std::size_t
     *         or posix_memalign reports failure
     */
    T* allocate(std::size_t num)
    {
        // Guard the multiplication below against wrap-around.
        if (num > static_cast<std::size_t>(-1) / sizeof(T))
            throw std::bad_alloc();
        void* ptr = nullptr;
        if (posix_memalign(&ptr, 4096, num * sizeof(T)))
            throw std::bad_alloc();
        return static_cast<T*>(ptr);
    }

    /**
     * Releases storage previously returned by allocate().
     *
     * @param p  pointer returned by allocate(); the element count passed as
     *           the second argument is not needed by free() and is ignored
     */
    void deallocate(T* p, std::size_t /*num*/) noexcept
    {
        free(p);
    }
};

// Stateless allocator: every instance is interchangeable with every other.
template <typename T, typename U>
bool operator==(const aligned_allocator<T>&, const aligned_allocator<U>&) noexcept
{
    return true;
}

template <typename T, typename U>
bool operator!=(const aligned_allocator<T>&, const aligned_allocator<U>&) noexcept
{
    return false;
}

该主机函数负责将内核函数传送到设备上执行，并验证结果是否正确。

在Application Project Settings界面的Hardware Functions区域，先点击添加二进制容器的按钮，再点击添加硬件函数的按钮，选择kernel_test函数，并将Active build configuration设置为Hardware。

在Assistant界面右键Hardware->Build进行编译，编译成功后在工程目录下生成如下文件：

其中sd_card.img用来烧写进SD卡的镜像

运用镜像烧写工具将sd_card.img烧写进SD卡，插上SD卡启动ZCU104,通过串口连接将显示以下信息：

执行以下命令：
cd /mnt/sd-mmcblk0p1
source ./init.sh

运行程序：
./test binary_container_1.xclbin

显示如下：

显示TEST PASSED，成功