Background: how does the program in the ZynqNet project allocate addresses in DRAM as global memory, and how is the corresponding memory on each side allocated?
CPU-side functions and their roles
FPGA-side functions and their roles
Following the layer information passed in over AXI-Lite, read data from DRAM and run the computation
Four kinds of on-chip cache (report 4.2.4)
processing_elements and memory_controller
Note that netconfig and network are used not only in the CPU-side definitions but also in the FPGA-side definitions.
1. CPU-Side Definition of the DRAM
1.1 Global Variables for the DRAM Pointers
The DRAM is the memory space shared between the ARM and the FPGA, and these variables carry the parameters between the two sides.
// = Global Variables (Memory Pointers) =
// Pointers to Shared DRAM Memory
char *SHARED_DRAM;
// layer_t *SHARED_DRAM_LAYER_CONFIG;
float *SHARED_DRAM_LAYER_CONFIG;
data_t *SHARED_DRAM_WEIGHTS;
data_t *SHARED_DRAM_DATA;
These global variables, defined outside main, are how the CPU and FPGA share the pointers into DRAM and thereby exchange weights and data.
1.2 Functions That Set Up the DRAM Pointers
//main
// Allocate Shared Memory in DRAM for Weights + Data.
allocate_DRAM_memory(net_CPU);
// Copy Layer Weights to DRAM.
copy_weights_to_DRAM(net_CPU);
// ===========================
// = Load + Copy Input Image =
// ===========================
layer_t input_layer = net_CPU->layers[0];
// Allocate Memory for Input Image:
data_t *input_image = allocate_image_memory(input_layer);
// Load Input Image
load_prepared_input_image(input_layer, input_image, input_filename);
// Copy Input Image into shared DRAM
copy_input_image_to_DRAM(input_layer, input_image);
allocate_DRAM_memory(net_CPU) reserves the DRAM region and computes its addresses; based on those addresses, copy_weights_to_DRAM copies the weights in net_CPU->weights into DRAM. Note that the weights, too, are first read from file into a buffer and only then copied from that buffer into DRAM.
load_prepared_input_image and copy_input_image_to_DRAM do the same for the image: load it from file into a buffer, then copy it from the buffer into DRAM.
// in cpu_top.cpp
void allocate_DRAM_memory(network_t *net_CPU) {
// For Simulation purposes, allocate space on Heap
// For actual HW Implementation, fixed Memory Address in SHARED DRAM is used
// Memory Requirements (Bytes)
int weightsize = net_CPU->num_weights * sizeof(data_t);
int datasize = net_CPU->total_pixel_mem * sizeof(data_t);
// Round memory areas to 32-bit boundaries (4 bytes)
weightsize = std::ceil(weightsize / 4.0) * 4;
datasize = std::ceil(datasize / 4.0) * 4;
int total_size = weightsize + datasize;
// Memory Allocation
if (USE_FPGA_BLOCK) {
// Get Pointer to SHARED DRAM from XFPGA wrapper
SHARED_DRAM = (volatile char *)XFPGA_shared_DRAM_virtual();
} else {
// Allocate SHARED DRAM on Heap
SHARED_DRAM = (volatile char *)malloc(total_size);
}
SHARED_DRAM_WEIGHTS = (volatile data_t *)(SHARED_DRAM);
SHARED_DRAM_DATA = (volatile data_t *)(SHARED_DRAM + weightsize);
}
Depending on the amount of data: when simulating on the ARM alone, the shared region is simply created on the heap with malloc; when the FPGA block is used, the region must live in the reserved DRAM area, so SHARED_DRAM = (volatile char *)XFPGA_shared_DRAM_virtual(); is used to obtain the virtual address of that region from the driver.
// in xfpga.cpp
void XFPGA_Initialize() {
printf("XFPGA Driver: Initialize\n");
axilite_open();
SHARED_DRAM_open();
}
void XFPGA_Release() {
printf("XFPGA Driver: Release\n");
axilite_close();
SHARED_DRAM_close();
}
volatile data_t *XFPGA_shared_DRAM_virtual() {
printf("XFPGA Driver: SHARED_DRAM_virtual() = %X\n", (unsigned long)(SHARED_DRAM_virtual()));
return (volatile data_t*) (SHARED_DRAM_virtual());
}
volatile data_t *XFPGA_shared_DRAM_physical() {
printf("XFPGA Driver: SHARED_DRAM_physical() = %X\n", (unsigned long)(SHARED_DRAM_physical()));
return (volatile data_t*) (SHARED_DRAM_physical());
}
At initialization time the driver is set up through the global variables introduced above.
1.3 The Low-Level DRAM Driver
// in xfpga.cpp
volatile data_t *XFPGA_shared_DRAM_virtual() {
printf("XFPGA Driver: SHARED_DRAM_virtual() = %X\n", (unsigned long)(SHARED_DRAM_virtual()));
return (volatile data_t*) (SHARED_DRAM_virtual());
}
This driver runs on the CPU side. The call SHARED_DRAM = (volatile char *)XFPGA_shared_DRAM_virtual(); is backed by shared_dram.hpp, which defines the physical address and size of the DRAM region, and by shared_dram.cpp, which implements the corresponding functions.
// in shared_dram.hpp
// Location + Size of SHARED DRAM segment:
// - from Vivado Block Designer (Address Editor):
// AXI M memory bus starts at 0x00000000 – 0xFFFFFFFF, SIZE: 4GB
// - from information by Simon Wright:
// top 128MB of 1GB system memory are not OS-managed
// - from "free -m" on Zynq:
// total mem 882MB -> 118MB not OS-managed
// -> place SHARED_DRAM at 896MB (-> max. activations ~100MB)
// -> 896MB = 896*1024*1024 = 0x3800'0000 bytes
// -> 96MB = 96*1024*1024 = 0x600'0000 bytes
const off_t SHARED_DRAM_BASE_ADDR = 0x38000000;
const size_t SHARED_DRAM_MEM_SIZE = 0x06000000;
extern int SHARED_DRAM_FD;
extern volatile u32* SHARED_DRAM_PTR;
These values presumably have to match the address map chosen when the system is assembled in Vivado.
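As a quick check of the arithmetic in the comments above (not part of the original source), the two constants can be verified at compile time:
// not in the original source: compile-time check of the 896 MB / 96 MB arithmetic
static_assert(896L * 1024 * 1024 == 0x38000000, "SHARED_DRAM_BASE_ADDR is the 896 MB mark");
static_assert( 96L * 1024 * 1024 == 0x06000000, "SHARED_DRAM_MEM_SIZE is 96 MB");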
1.4 Driver Implementation Details
axilite.cpp is very similar to shared_dram.cpp; both may need to be adapted to the actual system generated in Vivado. For now it is enough to understand what they do and come back to the details during the actual implementation.
Two global variables are defined at file scope:
int SHARED_DRAM_FD = -1;
volatile u32* SHARED_DRAM_PTR = NULL;
1.4.1 SHARED_DRAM_open
// in shared_dram.cpp
bool SHARED_DRAM_open() {
printf("XFPGA Driver: open /dev/mem handle\n");
// Check that it's not yet open
if (SHARED_DRAM_FD > -1) {
printf("SHARED_DRAM already open!\n");
return false;
}
// Memory Map SHARED_DRAM
SHARED_DRAM_PTR = map_SHARED_DRAM(SHARED_DRAM_BASE_ADDR);
printf("SHARED_DRAM_PTR=%X\n", (unsigned long)SHARED_DRAM_PTR);
// Make sure the file handle is really set
return (SHARED_DRAM_FD > -1);
}
volatile u32* map_SHARED_DRAM(off_t base_addr) {
printf("XFPGA Driver: map shared DRAM at base address %X\n", (unsigned long)base_addr);
// make sure that base addr is aligned to memory pages...
base_addr &= ~(getpagesize() - 1);
// Open /dev/mem file (need root privileges or setuid!)
SHARED_DRAM_FD = open("/dev/mem", O_RDWR);
if (SHARED_DRAM_FD < 0) err(errno, "could not open /dev/mem. need to be root");
// Map SHARED_DRAM memory region to pointer
volatile u32* pointer = (u32*)mmap(NULL, SHARED_DRAM_MEM_SIZE, PROT_READ | PROT_WRITE,
MAP_SHARED, SHARED_DRAM_FD, base_addr);
if (pointer == MAP_FAILED) err(errno, "could not map memory for SHARED_DRAM bus");
return pointer;
}
The three key statements are:
SHARED_DRAM_PTR = map_SHARED_DRAM(SHARED_DRAM_BASE_ADDR);
// Open /dev/mem file (need root privileges or setuid!)
SHARED_DRAM_FD = open("/dev/mem", O_RDWR);
volatile u32* pointer = (u32*)mmap(NULL, SHARED_DRAM_MEM_SIZE, PROT_READ | PROT_WRITE,
MAP_SHARED, SHARED_DRAM_FD, base_addr);
SHARED_DRAM_FD = open("/dev/mem", O_RDWR); opens the physical-memory device on the ARM's Linux side, and mmap then maps the reserved DRAM region into the process's virtual address space.
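SHARED_DRAM_close(), called from XFPGA_Release(), is not listed here; a minimal sketch of what it presumably does, mirroring the open/mmap code above (assumed, not copied from shared_dram.cpp):
// sketch only: assumed counterpart of SHARED_DRAM_open()
// (uses munmap/close from <sys/mman.h> and <unistd.h>, like the existing code)
bool SHARED_DRAM_close() {
printf("XFPGA Driver: close /dev/mem handle\n");
if (SHARED_DRAM_FD < 0) return false; // nothing to close
munmap((void *)SHARED_DRAM_PTR, SHARED_DRAM_MEM_SIZE); // undo the mmap
close(SHARED_DRAM_FD); // release the /dev/mem handle
SHARED_DRAM_PTR = NULL;
SHARED_DRAM_FD = -1;
return true;
}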
1.4.2 mmap
https://baike.baidu.com/item/mmap/1322217?fr=aladdin
https://blog.csdn.net/yangle4695/article/details/52139585
mmap maps a file or another object into memory. The file is mapped onto whole pages; if its size is not a multiple of the page size, the unused part of the last page is zero-filled. mmap is an important system call for user-space memory mapping.
This call is the core of accessing the DDR here. mmap gives a user program direct access to device memory, which is more efficient than copying data back and forth between user space and kernel space, so it is common in performance-critical applications. The mapped length must be a multiple of the page size, stream-oriented devices cannot be mmap'ed, and the details of mmap are hardware-dependent.
void *mmap(void *start, size_t length, int prot, int flags, int fd, off_t offset);
1.4.3 getpagesize
https://baike.baidu.com/item/getpagesize
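getpagesize() is only used for the alignment mask in map_SHARED_DRAM above; a small worked example, assuming the usual 4 KB pages on the Zynq's Linux:
// with getpagesize() == 4096, ~(getpagesize() - 1) == ~0xFFF
off_t base = 0x38001234; // hypothetical, slightly misaligned address
base &= ~(getpagesize() - 1); // -> 0x38001000, rounded down to a page boundary
// SHARED_DRAM_BASE_ADDR = 0x38000000 is already page-aligned, so the mask leaves it unchanged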
1.5 Overall Flow
On the CPU side, a handful of global variables (SHARED_DRAM, SHARED_DRAM_WEIGHTS, SHARED_DRAM_DATA) serve as the pointers into DRAM. The functions that affect these variables are:
// Initialize AXILITE Configuration Bus + Shared DRAM region
if (USE_FPGA_BLOCK) XFPGA_Initialize();
// Allocate Shared Memory in DRAM for Weights + Data.
allocate_DRAM_memory(net_CPU);
if (USE_FPGA_BLOCK) {
// Set Memory Configuration in FPGA
XFPGA_setDRAMBase(); // physical address!
XFPGA_setWeightsOffset(weights_offset);
XFPGA_setInputOffset(input_offset);
}
XFPGA_Initialize() internally calls SHARED_DRAM_open(), which maps the DRAM region via open and mmap.
allocate_DRAM_memory() then derives SHARED_DRAM, SHARED_DRAM_WEIGHTS, SHARED_DRAM_DATA and so on from the mapped pointer; from this point on, every DRAM access goes through these pointers. How the offsets handed to the FPGA relate to this layout is sketched below.
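The values of weights_offset and input_offset are not computed in the snippet above. Given the layout in allocate_DRAM_memory (weights at the start of the region, data right after), they are presumably just the positions of the two sub-regions relative to SHARED_DRAM; a sketch, assuming the offsets are expressed in data_t elements to match the FPGA-side indexing SHARED_DRAM[dram_weights_offset + ...]:
// sketch only: offsets of the two sub-regions, in data_t elements
int weights_offset = ((char *)SHARED_DRAM_WEIGHTS - (char *)SHARED_DRAM) / sizeof(data_t); // = 0
int input_offset = ((char *)SHARED_DRAM_DATA - (char *)SHARED_DRAM) / sizeof(data_t); // = weightsize / sizeof(data_t)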
2. Operations on DRAM
2.1 Writing the Weights
First, in network.cpp, the network is described and the weights are loaded:
// in network.cpp, function get_network_config
addLayer(net, layer_t("c10/p1", 8, 8, 736, 512, 1, 0, 1, 0, 1, 0, 1));
addLayer(net, layer_t("c10/p2", 8, 8, 736, 512, 1, 0, 1, 0, 0, 1, 1));
net->num_weights = 2528800;
const char* filename = "weights.bin";
loadWeightsFromFile(net, filename);
return net;
loadWeightsFromFile first reads the weights from file into the net struct, i.e. into an ARM-side buffer.
copy_weights_to_DRAM(net_CPU) then writes those weights into DRAM.
Once the DRAM address is known, a single memcpy is all that is needed:
// in cpu_top.cpp
void copy_weights_to_DRAM(network_t *net_CPU) {
int weightsize = net_CPU->num_weights * sizeof(data_t);
// Info:
printf("CPU: Copy Weights: %dKB (weights)\n", weightsize / 1024);
// Copy Weights:
memcpy((void *)SHARED_DRAM_WEIGHTS, net_CPU->weights, weightsize);
}
2.2 Writing the Input Image
// in main
layer_t input_layer = net_CPU->layers[0];
// Allocate Memory for Input Image:
data_t *input_image = allocate_image_memory(input_layer);
// Load Input Image
load_prepared_input_image(input_layer, input_image, input_filename);
// Copy Input Image into shared DRAM
copy_input_image_to_DRAM(input_layer, input_image);
allocate_image_memory(input_layer) mallocs an image buffer sized from the input layer's dimensions:
// in cpu_top.cpp
data_t *allocate_image_memory(layer_t &layer) {
int win = layer.width;
int hin = layer.height;
int chin = layer.channels_in;
return (data_t *)malloc(win * hin * chin * sizeof(data_t));
}
load_prepared_input_image then reads the image file into this buffer, and copy_input_image_to_DRAM copies the image into DRAM, again with a plain memcpy (a sketch of the loader itself follows after the copy function):
void copy_input_image_to_DRAM(layer_t &layer, data_t *image) {
// Calculate size of input data
int win = layer.width;
int hin = layer.height;
int chin = layer.channels_in;
int num_pixels = win * hin * chin;
int input_size = num_pixels * sizeof(data_t);
printf("CPU: Copy Input Image (%dKB)\n", input_size / 1024);
// Copy Input Data:
memcpy((void *)SHARED_DRAM_DATA, image, input_size);
}
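load_prepared_input_image itself is not listed in this post; a minimal sketch of what such a loader could look like, assuming the prepared image file stores exactly width*height*channels raw data_t values (the real file format is defined by the project, not here):
// sketch only: read raw data_t pixels from the prepared image file into the CPU buffer
// needs <cstdio>
void load_prepared_input_image(layer_t &layer, data_t *buffer, const char *filename) {
int num_pixels = layer.width * layer.height * layer.channels_in;
FILE *f = fopen(filename, "rb");
if (!f) { printf("ERROR: could not open %s\n", filename); return; }
size_t pixels_read = fread(buffer, sizeof(data_t), num_pixels, f);
printf("CPU: loaded %d of %d pixels from %s\n", (int)pixels_read, num_pixels, filename);
fclose(f);
}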
3. FPGA-Side Operations on DRAM
Related: ZynqNet Analysis (4): FPGA-Side Program Analysis, https://blog.csdn.net/weixin_36474809/article/details/82683399
3.1 Moving Data Between DRAM and BRAM
The DRAM address is either handed to the FPGA by the CPU over the AXI-Lite bus or fixed directly in the FPGA design, roughly as sketched below.
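The post does not show how the DRAM pointer and offsets actually arrive at the FPGA; a rough sketch of what the HLS top-level interface typically looks like for this kind of design (the parameter list and pragma details here are illustrative, not copied from fpga_top.cpp): one AXI master port for bulk DRAM access plus AXI-Lite registers for the configuration values.
// illustrative only: AXI master for DRAM, AXI-Lite for configuration registers
void fpga_top(data_t *SHARED_DRAM, unsigned int weights_offset, unsigned int input_offset) {
#pragma HLS INTERFACE m_axi port=SHARED_DRAM offset=slave bundle=memorybus
#pragma HLS INTERFACE s_axilite port=weights_offset bundle=axilite
#pragma HLS INTERFACE s_axilite port=input_offset bundle=axilite
#pragma HLS INTERFACE s_axilite port=return bundle=axilite
// ... per-layer loop: preload caches from DRAM, run the PEs, write results back ...
}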
Data held in BRAM can feed the computation at full speed. The ***_cache.cpp files implement exactly this caching; a cache is a small, fast memory sitting between the processor and DRAM, and here the FPGA uses BRAM in that role.
cache存储器 https://baike.baidu.com/item/CACHE%E5%AD%98%E5%82%A8%E5%99%A8/12789048?f...
memory_controller.cpp moves data back and forth between BRAM and DRAM. The set* functions set the relevant offsets, and reads and writes then index directly off the SHARED_DRAM pointer.
One thing to distinguish: reg versus direct access
// defined in fpga_top.hpp; exactly how it improves hardware timing still needs further study
template <typename T>
T reg(T x) {
#pragma HLS pipeline
#pragma HLS inline self off
#pragma HLS interface ap_ctrl_none register port=return
return x;
}
Some accesses go through the reg function while others touch the array directly, so what is the difference? It appears that reads wrap the access in reg, as in reg(SHARED_DRAM[dram_weights_offset + layer_weights_offset + addr]), while writes assign through the pointer directly?
data_t MemoryController::loadNextWeight(data_t* SHARED_DRAM,
weightaddr_t addr) {
#pragma HLS inline
#pragma HLS pipeline
data_t read = reg(SHARED_DRAM[dram_weights_offset + layer_weights_offset + addr]);
return read;
}
void MemoryController::setPixelLoadRow(coordinate_t y) {
layer_pixel_offset = layer_input_offset + pixels_per_row * y;
}
data_t MemoryController::loadNextChannel(data_t* SHARED_DRAM) {
#pragma HLS inline
#pragma HLS pipeline II=1
data_t pixel_from_ram = reg(SHARED_DRAM[dram_data_offset + layer_pixel_offset]);
layer_pixel_offset++; // increment address for next fetch
return pixel_from_ram;
};
void MemoryController::writeBackOutputChannel(data_t* SHARED_DRAM, channel_t co,
data_t data) {
#pragma HLS inline
LOG_LEVEL_INCR;
SHARED_DRAM[dram_data_offset + pixel_output_offset + co] = data;
}
3.2 Implementing the Caches
An array IBRAM is declared and bound to BRAM to serve as the fast on-chip cache.
//get pixel out from BRAM
data_t ImageCache::getPixel(const coordinate_t y, const imgcacheaddr_t y_offset,
const coordinate_t x, const channel_t ci) {
#pragma HLS inline
#pragma HLS RESOURCE variable = IBRAM core = RAM_S2P_BRAM
//cacheline_t req_line = (y) % NUM_IMG_CACHE_LINES;
//imgcacheaddr_t addr_line_offset = req_line * line_width;
//#pragma HLS RESOURCE variable=addr_line_offset core=MulnS latency=2
imgcacheaddr_t addr_pixel_offset = x * ch_in;
imgcacheaddr_t addr = y_offset + addr_pixel_offset + ci;
bool is_padding_pixel = x < 0 | x >= width_in | y < 0 | y >= height_in;
data_t px = is_padding_pixel ? 0.0f : IBRAM[addr];
return px;
}
This function reads a pixel out of IBRAM; it is also the only place where the RESOURCE pragma binds IBRAM to a simple dual-port BRAM (RAM_S2P_BRAM).
void ImageCache::setNextChannel(data_t value) {
imgcacheaddr_t MAX_ADDR = (line_width * NUM_IMG_CACHE_LINES - 1);
// Write Value into IBRAM
IBRAM[curr_img_cache_addr] = value;
// Check and Wrap Write Address into IBRAM
if (curr_img_cache_addr == MAX_ADDR)
curr_img_cache_addr = 0;
else
curr_img_cache_addr++;
}
void ImageCache::preloadPixelFromDRAM(data_t *SHARED_DRAM) {
#pragma HLS inline
L_PRELOAD_PIXEL_FROM_DRAM: for (channel_t ci = 0; ci < ch_in; ci++) {
#pragma HLS LOOP_TRIPCOUNT min = 3 max = 1024 avg = 237
#pragma HLS pipeline II = 1
#pragma HLS latency min=4
data_t px = MemoryController::loadNextChannel(SHARED_DRAM);
setNextChannel(px);
}
loads_left = loads_left - ch_in;
}
void ImageCache::preloadRowFromDRAM(data_t *SHARED_DRAM) {
#pragma HLS inline
L_DRAM_PRELOADROW_X: for (coordinate_t x = 0; x < width_in; x++) {
#pragma HLS LOOP_TRIPCOUNT min = 8 max = 256 avg = 45
preloadPixelFromDRAM(SHARED_DRAM);
}
}
Here a whole row of pixels is loaded into BRAM through the memory controller. preloadRowFromDRAM loops over the row and calls preloadPixelFromDRAM for each x position, which in turn loads every input channel of that pixel.
The other caches follow the same pattern; as an example, a sketch of the output cache is given below.
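OutputCache is used in processAllCHout below (setChannel / accumulateChannel) but is not listed in this post; a sketch of the idea, assuming one partial sum per output channel held in BRAM (the member name OBRAM and the resource binding are illustrative):
// sketch only: per-output-channel partial sums held in BRAM
void OutputCache::setChannel(channel_t co, data_t value) {
#pragma HLS inline
OBRAM[co] = value; // overwrite: used for the first input channel (ci == 0)
}
void OutputCache::accumulateChannel(channel_t co, data_t value_to_add) {
#pragma HLS inline
data_t old_ch = OBRAM[co]; // read current partial sum
data_t new_ch = old_ch + value_to_add; // add this input channel's contribution
OBRAM[co] = new_ch; // write back
}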
3.3 Parallelization
In ProcessingElement, the required data is brought in from the BRAMs; the ARRAY_PARTITION pragma splits the small buffers completely so that all of their elements can be accessed in parallel, and the dataflow pragma then lets the load and compute stages overlap.
void ProcessingElement::processInputChannel(const coordinate_t y,
const coordinate_t x,
const channel_t ci_in,
const channel_t ch_out) {
#pragma HLS inline off
#pragma HLS FUNCTION_INSTANTIATE variable = ci_in
#pragma HLS dataflow
channel_t ci = ci_in;
weightaddr_t ci_offset;
data_t pixel_buffer[9];
#pragma HLS ARRAY_PARTITION variable = pixel_buffer complete dim = 0
// Preload Image Pixel Buffer (fetch pixels around (y,x,ci))
preloadPixelsAndPrecalcCIoffset(y, x, ci, ch_out, ci_offset, pixel_buffer);
// MACC All Output Channels
processAllCHout(ch_out, ci, ci_offset, pixel_buffer);
}
void ProcessingElement::processAllCHout(const channel_t ch_out,
const channel_t ci,
const weightaddr_t ci_offset,
const data_t pixels[9]) {
#pragma HLS INLINE off
L_CH_OUT:
for (channel_t co = 0; co < ch_out; co++) {
#pragma HLS LOOP_TRIPCOUNT min = 16 max = 1024 avg = 258
#pragma HLS unroll factor = N_PE
#pragma HLS PIPELINE II = 1
data_t result, weights_local[9];
#pragma HLS ARRAY_PARTITION variable = weights_local complete dim = 0
// fetch weights
WeightsCache::getNineWeights(co, ci_offset, weights_local);
// multiply-accumulate
macc2d(pixels, weights_local, result);
// save result to Output Buffer
if (ci == 0) {
OutputCache::setChannel(co, result);
} else {
OutputCache::accumulateChannel(co, result);
}
};
}
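macc2d, the 3x3 multiply-accumulate called above, is not listed in this post either; a sketch of the operation, fully unrolled so the nine products can be formed in parallel (the real implementation may use an explicit adder tree and additional pragmas):
// sketch only: 3x3 multiply-accumulate over one pixel neighbourhood
void ProcessingElement::macc2d(const data_t pixels[9], const data_t weights[9], data_t &result) {
#pragma HLS inline
data_t accumulator = 0;
L_MACC: for (int i = 0; i < 9; i++) {
#pragma HLS unroll
accumulator += pixels[i] * weights[i]; // nine parallel multiplies, summed
}
result = accumulator;
}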
---------------------
Author: 邢翔瑞
Source: CSDN
Original post: https://blog.csdn.net/weixin_36474809/article/details/83409853