本文总结几种不同 AXI IP 核的实现方法、性能对比、性能差异的分析,以及可能的改进方向。使用的硬件平台是 Zedboard。

不同的AXI总线卷积加速模块的概况

这次实现并逐渐优化了三个版本的卷积加速模块,先简要描述各个版本的主要内容。

版本一

版本一主要是用来测试AXI总线IP核的实现可能。

  • 该模块拥有19个32位寄存器
  • 其中前9个寄存器用来保存需要计算的值
  • 后面9个寄存器用来保存卷积核
  • 在读取第19个寄存器的地址的时候计算9个寄存器的卷积和(该计算可以在一个时钟周期内完成)
  • 9个寄存器单独赋值,程序中分别向对应地址写入内容,通过总线进行传输。
  • 故乐观地估算,每得到一个输出需要10个总线周期(9次写入加1次读取)

可以从驱动的书写简单理解一下:

/*
 * Version 1 hardware convolution driver.
 *
 * For every output position the nine input samples of the 3x3 window are
 * written one by one to the data registers at byte offsets 0..32 of the
 * IP core, and the result is then read back from byte offset 72 into the
 * global `res` array. That is nine bus writes plus one bus read per output.
 *
 * filter           - not referenced here; this function never writes the
 *                    kernel registers.
 * arr              - input matrix.
 * filterW, filterH - kernel size, only used to derive the loop bounds.
 * arrW, arrH       - input size, only used to derive the loop bounds.
 */
void Conv_HW(int filter[3][3], int arr[100][100],
             int filterW, int filterH, int arrW, int arrH) {
    const int rowLimit = filterH + arrH - 3;
    const int colLimit = filterW + arrW - 3;
    int row, col, dr, dc;

    for (row = 2; row < rowLimit; row++) {
        for (col = 2; col < colLimit; col++) {
            /* Register k = 3*dr + dc receives arr[row - dr][col - dc]. */
            for (dr = 0; dr < 3; dr++) {
                for (dc = 0; dc < 3; dc++) {
                    Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 4 * (3 * dr + dc),
                              arr[row - dr][col - dc]);
                }
            }
            res[row][col] = Xil_In32(XPAR_CONV_0_S00_AXI_BASEADDR + 72);
        }
        /* Progress indicator: one '=' every 15 rows. */
        if (row % 15 == 0) {
            printf("=");
        }
    }
}

版本一性能

  • 版本一性能最惨,由于没有时间戳,目测软件计算速度远远快于FPGA核心运算速度。
  • 版本一的改进方向就是引入滑动窗口,以最大程度地减少总线周期。

版本二

版本二引入滑动窗口,和初期设计的概念相同。

  • 该模块拥有19个32位寄存器

  • 其中前9个寄存器用来保存需要计算的值

  • 后面9个寄存器用来保存卷积核

  • 在读取第19个寄存器的地址的时候计算9个寄存器的卷积和(该计算可以在一个时钟周期内完成)

  • 三个寄存器滑动赋值:每次只向三个寄存器写入新的一列数据,计算窗口在待计算矩阵上滑动。除了每行冷启动时需要额外两列的写入(6次总线写)用来预载寄存器,后面的每一个计算只需要四个总线周期(3次写入加1次读取)

    可以通过写的驱动简单理解一下:

    /*
     * Version 2 hardware convolution driver (sliding window).
     *
     * Only three registers are written per step, at byte offsets 8, 20 and
     * 32. Each write pushes one new sample into one row of the 3x3 window.
     * After three writes the result is read from byte offset 72 into the
     * global `res` array.
     *
     * filter     - not referenced here; the kernel registers are loaded
     *              elsewhere.
     * arr        - input matrix.
     * arrW, arrH - input width / height, used as loop bounds.
     *
     * Fix: the window is pre-loaded with columns 1 and 2 on EVERY row.
     * Previously the pre-load indexed with `j`, which was only 2 for the
     * first row and equal to arrW afterwards, so the first outputs of every
     * later row were computed from the wrong columns.
     */
    void Conv_HW(int filter[3][3], int arr[100][100], int arrW, int arrH) {
        int i, j;
        for (i = 2; i < arrH; i++) {
            /* Pre-load: push columns 1 and 2 of rows i-1, i, i+1. */
            for (j = 1; j <= 2; j++) {
                Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 8, arr[i - 1][j]);
                Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 20, arr[i][j]);
                Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 32, arr[i + 1][j]);
            }
            /* Steady state: push column j+1, then read the output for (i, j). */
            for (j = 2; j < arrW; j++) {
                Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 8, arr[i - 1][j + 1]);
                Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 20, arr[i][j + 1]);
                Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 32, arr[i + 1][j + 1]);
                res[i][j] = Xil_In32(XPAR_CONV_0_S00_AXI_BASEADDR + 72);
            }
        }
    }

版本二性能

测试样本:500*500 的矩阵,元素为 32 bit,计算 200 次。

软件耗时 33.78 秒,卷积 IP 核耗时 40.25 秒。

这样的结果还是非常不乐观,分析可能有两种因素限制了 IP 核的速度:

  • 两个寄存器的乘法LUT太大,无法硬件优化
  • 总线周期太慢太慢

版本三对于这两种可能进行探索。

版本二的FPGA部分核心代码

    // Memory-mapped register write logic.
    // A write is committed in the cycle in which the address handshake
    // (axi_awready && S_AXI_AWVALID) and the data handshake
    // (axi_wready && S_AXI_WVALID) are both complete. Write strobes select
    // which byte lanes of the addressed register are updated.
    // All registers are cleared while reset (active low) is applied.
    assign slv_reg_wren = axi_wready && S_AXI_WVALID && axi_awready && S_AXI_AWVALID;

    // Returns `current` with every byte lane whose strobe bit is set
    // replaced by the matching byte lane of `wdata`.
    // NOTE(review): assumes the slv_reg* registers are C_S_AXI_DATA_WIDTH
    // bits wide; their declarations are not part of this excerpt.
    function [C_S_AXI_DATA_WIDTH-1:0] merge_wstrb;
      input [C_S_AXI_DATA_WIDTH-1:0]     current;
      input [C_S_AXI_DATA_WIDTH-1:0]     wdata;
      input [(C_S_AXI_DATA_WIDTH/8)-1:0] wstrb;
      integer lane;
      begin
        merge_wstrb = current;
        for ( lane = 0; lane <= (C_S_AXI_DATA_WIDTH/8)-1; lane = lane+1 )
          if ( wstrb[lane] == 1 )
            merge_wstrb[(lane*8) +: 8] = wdata[(lane*8) +: 8];
      end
    endfunction

    always @( posedge S_AXI_ACLK )
    begin
      if ( S_AXI_ARESETN == 1'b0 )
        begin
          // Registers 0-8 hold the data window, registers 9-17 the kernel.
          slv_reg0  <= 0;
          slv_reg1  <= 0;
          slv_reg2  <= 0;
          slv_reg3  <= 0;
          slv_reg4  <= 0;
          slv_reg5  <= 0;
          slv_reg6  <= 0;
          slv_reg7  <= 0;
          slv_reg8  <= 0;
          slv_reg9  <= 0;
          slv_reg10 <= 0;
          slv_reg11 <= 0;
          slv_reg12 <= 0;
          slv_reg13 <= 0;
          slv_reg14 <= 0;
          slv_reg15 <= 0;
          slv_reg16 <= 0;
          slv_reg17 <= 0;
        end
      else if (slv_reg_wren)
        begin
          case ( axi_awaddr[ADDR_LSB+OPT_MEM_ADDR_BITS:ADDR_LSB] )
            5'h00: slv_reg0 <= merge_wstrb(slv_reg0, S_AXI_WDATA, S_AXI_WSTRB);
            5'h01: slv_reg1 <= merge_wstrb(slv_reg1, S_AXI_WDATA, S_AXI_WSTRB);
            // Writing register 2 shifts window row 0: reg1 -> reg0, reg2 -> reg1.
            5'h02:
              begin
                slv_reg0 <= slv_reg1;
                slv_reg1 <= slv_reg2;
                slv_reg2 <= merge_wstrb(slv_reg2, S_AXI_WDATA, S_AXI_WSTRB);
              end
            5'h03: slv_reg3 <= merge_wstrb(slv_reg3, S_AXI_WDATA, S_AXI_WSTRB);
            5'h04: slv_reg4 <= merge_wstrb(slv_reg4, S_AXI_WDATA, S_AXI_WSTRB);
            // Writing register 5 shifts window row 1: reg4 -> reg3, reg5 -> reg4.
            5'h05:
              begin
                slv_reg3 <= slv_reg4;
                slv_reg4 <= slv_reg5;
                slv_reg5 <= merge_wstrb(slv_reg5, S_AXI_WDATA, S_AXI_WSTRB);
              end
            5'h06: slv_reg6 <= merge_wstrb(slv_reg6, S_AXI_WDATA, S_AXI_WSTRB);
            5'h07: slv_reg7 <= merge_wstrb(slv_reg7, S_AXI_WDATA, S_AXI_WSTRB);
            // Writing register 8 shifts window row 2: reg7 -> reg6, reg8 -> reg7.
            5'h08:
              begin
                slv_reg6 <= slv_reg7;
                slv_reg7 <= slv_reg8;
                slv_reg8 <= merge_wstrb(slv_reg8, S_AXI_WDATA, S_AXI_WSTRB);
              end
            // Kernel coefficient registers: plain strobed writes.
            5'h09: slv_reg9  <= merge_wstrb(slv_reg9,  S_AXI_WDATA, S_AXI_WSTRB);
            5'h0A: slv_reg10 <= merge_wstrb(slv_reg10, S_AXI_WDATA, S_AXI_WSTRB);
            5'h0B: slv_reg11 <= merge_wstrb(slv_reg11, S_AXI_WDATA, S_AXI_WSTRB);
            5'h0C: slv_reg12 <= merge_wstrb(slv_reg12, S_AXI_WDATA, S_AXI_WSTRB);
            5'h0D: slv_reg13 <= merge_wstrb(slv_reg13, S_AXI_WDATA, S_AXI_WSTRB);
            5'h0E: slv_reg14 <= merge_wstrb(slv_reg14, S_AXI_WDATA, S_AXI_WSTRB);
            5'h0F: slv_reg15 <= merge_wstrb(slv_reg15, S_AXI_WDATA, S_AXI_WSTRB);
            5'h10: slv_reg16 <= merge_wstrb(slv_reg16, S_AXI_WDATA, S_AXI_WSTRB);
            5'h11: slv_reg17 <= merge_wstrb(slv_reg17, S_AXI_WDATA, S_AXI_WSTRB);
            // Address 5'h12 (former slv_reg18) has no write case.
            default: ; // any other address: every register keeps its value
          endcase
        end
    end
    // Implement memory mapped register select and read logic generation
// Slave register read enable is asserted when a valid read address is
// available, the slave is ready to accept it, and no read response is
// currently pending (axi_rvalid low).
assign slv_reg_rden = axi_arready & S_AXI_ARVALID & ~axi_rvalid;

// Combinational read multiplexer.
// Addresses 5'h00-5'h11 return the raw registers. Address 5'h12 returns the
// 3x3 multiply-accumulate of the data window (slv_reg0..8) with the kernel
// (slv_reg9..17). Any other address reads as 0.
// Blocking assignments are used because this is purely combinational logic;
// non-blocking assignments belong in clocked blocks only.
always @(*)
begin
  // Address decoding for reading registers
  case ( axi_araddr[ADDR_LSB+OPT_MEM_ADDR_BITS:ADDR_LSB] )
    5'h00 : reg_data_out = slv_reg0;
    5'h01 : reg_data_out = slv_reg1;
    5'h02 : reg_data_out = slv_reg2;
    5'h03 : reg_data_out = slv_reg3;
    5'h04 : reg_data_out = slv_reg4;
    5'h05 : reg_data_out = slv_reg5;
    5'h06 : reg_data_out = slv_reg6;
    5'h07 : reg_data_out = slv_reg7;
    5'h08 : reg_data_out = slv_reg8;
    5'h09 : reg_data_out = slv_reg9;
    5'h0A : reg_data_out = slv_reg10;
    5'h0B : reg_data_out = slv_reg11;
    5'h0C : reg_data_out = slv_reg12;
    5'h0D : reg_data_out = slv_reg13;
    5'h0E : reg_data_out = slv_reg14;
    5'h0F : reg_data_out = slv_reg15;
    5'h10 : reg_data_out = slv_reg16;
    5'h11 : reg_data_out = slv_reg17;
    // Convolution result: sum of window[k] * kernel[k] for k = 0..8.
    5'h12 : reg_data_out = slv_reg0 * slv_reg9  +
                           slv_reg1 * slv_reg10 +
                           slv_reg2 * slv_reg11 +
                           slv_reg3 * slv_reg12 +
                           slv_reg4 * slv_reg13 +
                           slv_reg5 * slv_reg14 +
                           slv_reg6 * slv_reg15 +
                           slv_reg7 * slv_reg16 +
                           slv_reg8 * slv_reg17;
    default : reg_data_out = 0;
  endcase
end

版本三

先尝试生成更小的LUT

  • 该模块拥有19个32位寄存器
  • 其中前9个寄存器用来保存需要计算的值
  • 卷积核固定在Verilog中,用来生成更小的LUT
  • 一个计算只需要四个总线周期

性能测试

结果仍然是软件耗时约 33 秒,卷积 IP 核耗时约 40 秒。

基本可以排除是 LUT 的问题。

下面测试AXI总线问题:

假设所有数据均来自于FPGA,无需从总线写入:

/*
 * Bus-read-only benchmark variant of the hardware driver.
 *
 * No input data is written to the IP core. For every output position the
 * result register at byte offset 72 is read once and stored into the
 * global `res` array, so only the cost of one bus read per output is
 * measured.
 *
 * filter, arr - not referenced in this variant.
 * arrW, arrH  - loop bounds; outputs are produced for rows 2..arrH-1 and
 *               columns 2..arrW-1.
 */
void Conv_HW(int filter[3][3], int arr[100][100], int arrW, int arrH) {
    int row = 2;
    while (row < arrH) {
        int col = 2;
        while (col < arrW) {
            res[row][col] = Xil_In32(XPAR_CONV_0_S00_AXI_BASEADDR + 72);
            col++;
        }
        row++;
    }
}

只需要9.47秒即可完成计算,并传回CPU !!!

总结

至此,基本上可以否决由 CPU 通过 AXI 总线逐个寄存器读写来传输数据的做法:所有需要这样经总线传输数据的模块都会被总线周期所拖累,即使优化了传输次数,仍然无法解决该问题。确实需要一个更快的方式来传输数据。

在 Altera NIOS II 中,可以直接利用 IO 口传输数据,无需总线周期;再加上 NIOS II 内核没有流水线优化,所以相比之下硬件确实比较快。

附1:AXI4 总线的 FPGA 接口部分

先看总线接口:

        // Port list excerpt of the AXI slave interface, followed by the
        // internal registers declared for it. The module header and
        // parameter list are not part of this excerpt.
        // Users to add ports here

        // User ports ends
// Do not modify the ports beyond this line

// Global Clock Signal
input wire S_AXI_ACLK,
// Global Reset Signal. This Signal is Active LOW
input wire S_AXI_ARESETN,
// Write address (issued by master, accepted by Slave)
input wire [C_S_AXI_ADDR_WIDTH-1 : 0] S_AXI_AWADDR,
// Write channel Protection type. This signal indicates the
// privilege and security level of the transaction, and whether
// the transaction is a data access or an instruction access.
input wire [2 : 0] S_AXI_AWPROT,
// Write address valid. This signal indicates that the master signaling
// valid write address and control information.
// High means the write address is valid.
input wire S_AXI_AWVALID,
// Write address ready. This signal indicates that the slave is ready
// to accept an address and associated control signals.
// High: the slave is idle and ready to accept an address; low: the slave is busy.
output wire S_AXI_AWREADY,
// NOTE: the signals above carry the write ADDRESS; the signals below
// carry the write DATA.
// Write data (issued by master, accepted by Slave).
// The bus is C_S_AXI_DATA_WIDTH bits wide.
input wire [C_S_AXI_DATA_WIDTH-1 : 0] S_AXI_WDATA,
// Write strobes. This signal indicates which byte lanes hold
// valid data. There is one write strobe bit for each eight
// bits of the write data bus.
input wire [(C_S_AXI_DATA_WIDTH/8)-1 : 0] S_AXI_WSTRB,
// Write valid. This signal indicates that valid write
// data and strobes are available.
// High means the write data is valid.
input wire S_AXI_WVALID,
// Write ready. This signal indicates that the slave
// can accept the write data.
// High: the slave is idle and ready to accept data; low: the slave is busy.
output wire S_AXI_WREADY,
// Write response. This signal indicates the status
// of the write transaction.
// Per the original note, the possible responses are OKAY, EXOKAY, SLVERR and DECERR.
output wire [1 : 0] S_AXI_BRESP,
// Write response valid. This signal indicates that the channel
// is signaling a valid write response.
// High means the write response is valid.
output wire S_AXI_BVALID,
// Response ready. This signal indicates that the master
// can accept a write response.
// High: the master is idle and ready to accept the response; low: the master is busy.
input wire S_AXI_BREADY,
// Read address (issued by master, accepted by Slave).
// Per the original note: for a burst, this is the address of the first transfer.
input wire [C_S_AXI_ADDR_WIDTH-1 : 0] S_AXI_ARADDR,
// Protection type. This signal indicates the privilege
// and security level of the transaction, and whether the
// transaction is a data access or an instruction access.
// The original note recommends the value 3'b000.
input wire [2 : 0] S_AXI_ARPROT,
// Read address valid. This signal indicates that the channel
// is signaling valid read address and control information.
input wire S_AXI_ARVALID,
// Read address ready. This signal indicates that the slave is
// ready to accept an address and associated control signals.
// High: the slave is idle and ready to accept an address; low: the slave is busy.
output wire S_AXI_ARREADY,
// Read data (issued by slave)
output wire [C_S_AXI_DATA_WIDTH-1 : 0] S_AXI_RDATA,
// Read response. This signal indicates the status of the
// read transfer.
output wire [1 : 0] S_AXI_RRESP,
// Read valid. This signal indicates that the channel is
// signaling the required read data.
output wire S_AXI_RVALID,
// Read ready. This signal indicates that the master can
// accept the read data and response information.
input wire S_AXI_RREADY
);
// AXI4LITE signals
// Internal registers for the slave side of the interface.
// NOTE(review): presumably these drive the S_AXI_* outputs of the same
// name; the connecting assigns are not shown in this excerpt.
reg [C_S_AXI_ADDR_WIDTH-1 : 0] axi_awaddr;
reg axi_awready;
reg axi_wready;
reg [1 : 0] axi_bresp;
reg axi_bvalid;
reg [C_S_AXI_ADDR_WIDTH-1 : 0] axi_araddr;
reg axi_arready;
reg [C_S_AXI_DATA_WIDTH-1 : 0] axi_rdata;
reg [1 : 0] axi_rresp;
reg axi_rvalid;

其中最为重要的读取总线信号寻址的部分:

// A register write is committed in the cycle in which both the address
// handshake and the data handshake are complete.
assign slv_reg_wren = axi_wready && S_AXI_WVALID && axi_awready && S_AXI_AWVALID;

// Returns `current` with every byte lane whose strobe bit is set replaced
// by the matching byte lane of `wdata`; a byte is only taken over when its
// strobe bit is 1.
// NOTE(review): assumes the slv_reg* registers are C_S_AXI_DATA_WIDTH bits
// wide; their declarations are not part of this excerpt.
function [C_S_AXI_DATA_WIDTH-1:0] merge_wstrb;
  input [C_S_AXI_DATA_WIDTH-1:0]     current;
  input [C_S_AXI_DATA_WIDTH-1:0]     wdata;
  input [(C_S_AXI_DATA_WIDTH/8)-1:0] wstrb;
  integer lane;
  begin
    merge_wstrb = current;
    for ( lane = 0; lane <= (C_S_AXI_DATA_WIDTH/8)-1; lane = lane+1 )
      if ( wstrb[lane] == 1 )
        merge_wstrb[(lane*8) +: 8] = wdata[(lane*8) +: 8];
  end
endfunction

always @( posedge S_AXI_ACLK )
begin
  if ( S_AXI_ARESETN == 1'b0 )
    begin
      slv_reg0 <= 0;
      slv_reg1 <= 0;
      slv_reg2 <= 0;
      slv_reg3 <= 0;
      slv_reg4 <= 0;
      slv_reg5 <= 0;
      slv_reg6 <= 0;
      slv_reg7 <= 0;
      slv_reg8 <= 0;
      slv_reg9 <= 0;
    end
  else if (slv_reg_wren)
    begin
      // Address decoding.
      // The low ADDR_LSB bits of the byte address select a byte inside one
      // register and are skipped: per the original note, ADDR_LSB = 2 for
      // 32-bit registers (4 bytes) and ADDR_LSB = 3 for 64-bit registers
      // (8 bytes). The bits above them select the register; with ten
      // registers the selected slice is 4 bits wide.
      case ( axi_awaddr[ADDR_LSB+OPT_MEM_ADDR_BITS:ADDR_LSB] )
        4'h0: slv_reg0 <= merge_wstrb(slv_reg0, S_AXI_WDATA, S_AXI_WSTRB);
        4'h1: slv_reg1 <= merge_wstrb(slv_reg1, S_AXI_WDATA, S_AXI_WSTRB);
        4'h2: slv_reg2 <= merge_wstrb(slv_reg2, S_AXI_WDATA, S_AXI_WSTRB);
        4'h3: slv_reg3 <= merge_wstrb(slv_reg3, S_AXI_WDATA, S_AXI_WSTRB);
        4'h4: slv_reg4 <= merge_wstrb(slv_reg4, S_AXI_WDATA, S_AXI_WSTRB);
        4'h5: slv_reg5 <= merge_wstrb(slv_reg5, S_AXI_WDATA, S_AXI_WSTRB);
        4'h6: slv_reg6 <= merge_wstrb(slv_reg6, S_AXI_WDATA, S_AXI_WSTRB);
        4'h7: slv_reg7 <= merge_wstrb(slv_reg7, S_AXI_WDATA, S_AXI_WSTRB);
        4'h8: slv_reg8 <= merge_wstrb(slv_reg8, S_AXI_WDATA, S_AXI_WSTRB);
        4'h9: slv_reg9 <= merge_wstrb(slv_reg9, S_AXI_WDATA, S_AXI_WSTRB);
        default: ; // any other address: every register keeps its value
      endcase
    end
end

附2:AXI4的测试模块与仿真测试

`timescale 1ns/1ns
// Simulation testbench for conv_v1_0_S00_AXI.
// After reset it writes nine values to byte addresses 0, 4, ..., 32 and
// then issues a read at byte address 36, dumping all signals to test.vcd.
//
// Changes versus the original listing:
//  * the master-driven inputs awprot/arprot/bready/rready were undriven
//    wires (Z); they are now tied to constant values;
//  * `i++` is replaced by `i = i + 1`, which is valid plain Verilog.
module conv_axi_test();
    parameter integer C_S00_AXI_DATA_WIDTH = 32;
    parameter integer C_S00_AXI_ADDR_WIDTH = 6;

    reg s00_axi_aclk;
    // Global reset, active low
    reg s00_axi_aresetn;

    // Write address channel
    reg  [C_S00_AXI_ADDR_WIDTH-1 : 0] s00_axi_awaddr;
    wire [2 : 0] s00_axi_awprot;
    reg  s00_axi_awvalid;
    wire s00_axi_awready;
    // Write data channel
    reg  [C_S00_AXI_DATA_WIDTH-1 : 0] s00_axi_wdata;
    reg  [(C_S00_AXI_DATA_WIDTH/8)-1 : 0] s00_axi_wstrb;
    reg  s00_axi_wvalid;
    wire s00_axi_wready;
    // Write response channel
    wire [1 : 0] s00_axi_bresp;
    wire s00_axi_bvalid;
    wire s00_axi_bready;
    // Read address channel
    reg  [C_S00_AXI_ADDR_WIDTH-1 : 0] s00_axi_araddr;
    wire [2 : 0] s00_axi_arprot;
    reg  s00_axi_arvalid;
    wire s00_axi_arready;
    // Read data channel
    wire [C_S00_AXI_DATA_WIDTH-1 : 0] s00_axi_rdata;
    wire [1 : 0] s00_axi_rresp;
    wire s00_axi_rvalid;
    wire s00_axi_rready;

    // The testbench plays the master: it always accepts responses and uses
    // protection type 000.
    assign s00_axi_awprot = 3'b000;
    assign s00_axi_arprot = 3'b000;
    assign s00_axi_bready = 1'b1;
    assign s00_axi_rready = 1'b1;

    // Device under test
    conv_v1_0_S00_AXI # (
        .C_S_AXI_DATA_WIDTH(C_S00_AXI_DATA_WIDTH),
        .C_S_AXI_ADDR_WIDTH(C_S00_AXI_ADDR_WIDTH)
    ) conv_v1_0_S00_AXI_inst (
        .S_AXI_ACLK(s00_axi_aclk),
        .S_AXI_ARESETN(s00_axi_aresetn),
        .S_AXI_AWADDR(s00_axi_awaddr),
        .S_AXI_AWPROT(s00_axi_awprot),
        .S_AXI_AWVALID(s00_axi_awvalid),
        .S_AXI_AWREADY(s00_axi_awready),
        .S_AXI_WDATA(s00_axi_wdata),
        .S_AXI_WSTRB(s00_axi_wstrb),
        .S_AXI_WVALID(s00_axi_wvalid),
        .S_AXI_WREADY(s00_axi_wready),
        .S_AXI_BRESP(s00_axi_bresp),
        .S_AXI_BVALID(s00_axi_bvalid),
        .S_AXI_BREADY(s00_axi_bready),
        .S_AXI_ARADDR(s00_axi_araddr),
        .S_AXI_ARPROT(s00_axi_arprot),
        .S_AXI_ARVALID(s00_axi_arvalid),
        .S_AXI_ARREADY(s00_axi_arready),
        .S_AXI_RDATA(s00_axi_rdata),
        .S_AXI_RRESP(s00_axi_rresp),
        .S_AXI_RVALID(s00_axi_rvalid),
        .S_AXI_RREADY(s00_axi_rready)
    );

    // Clock: toggles every 1 ns (2 ns period) for 1000 half-periods, then
    // ends the simulation.
    initial
    begin : clock_gen
        integer i;
        s00_axi_aclk = 1;
        for (i = 0; i < 1000; i = i + 1)
        begin
            #1 s00_axi_aclk = ~s00_axi_aclk;
        end
        $finish;
    end

    // Stimulus: each write is held for 4 ns, i.e. two clock periods.
    // NOTE(review): this assumes the slave completes one write within that
    // window; the handshake outputs are not checked here.
    initial
    begin
        s00_axi_aresetn = 0;
        s00_axi_arvalid = 0;
        #4 s00_axi_aresetn = 1;
        s00_axi_awvalid = 1;
        s00_axi_wvalid = 1;
        s00_axi_awaddr = 0;
        s00_axi_wstrb = 4'b1111;
        s00_axi_wdata = 3;
        #4 s00_axi_awaddr = 6'b000100;
        s00_axi_wdata = 21;
        #4 s00_axi_awaddr = 6'b001000;
        s00_axi_wdata = 19;
        #4 s00_axi_awaddr = 6'b001100;
        s00_axi_wdata = 22;
        #4 s00_axi_awaddr = 6'b010000;
        s00_axi_wdata = 20;
        #4 s00_axi_awaddr = 6'b010100;
        s00_axi_wdata = 13;
        #4 s00_axi_awaddr = 6'b011000;
        s00_axi_wdata = 16;
        #4 s00_axi_awaddr = 6'b011100;
        s00_axi_wdata = 14;
        #4 s00_axi_awaddr = 6'b100000;
        s00_axi_wdata = 7;
        // Finally request a read at byte address 36.
        #4
        s00_axi_arvalid = 1;
        s00_axi_araddr = 6'b100100;
    end

    // Waveform dump for GTKWave
    initial
    begin
        $dumpfile("test.vcd");
        $dumpvars();
    end
endmodule

利用 iverilog 进行仿真,并用 GTKWave 显示,测试波形如下:

新建IP核如下:

工程顶层图如下:

附3:软件驱动

#include <stdio.h>
#include <stdlib.h>
#include "platform.h"
#include "xbasic_types.h"
#include "xparameters.h"
#include "xil_io.h"
/* Selects which test main() runs: test_single, test_func or test_speed. */
#define test_speed

/*
 * Side length of the input buffer. The speed test uses a 500x500 matrix and
 * the convolution loops also read row index arrH and column index arrW
 * (i + 1 / j + 1 with i < arrH, j < arrW), so the buffer must be larger
 * than 500 in both directions.
 */
#define ARR_DIM 512

/* Output buffer shared by Conv_SW and Conv_HW. */
int res[1000][1000];

/* Busy-wait delay loop. Not called anywhere in this file. */
void delay() {
    int i, j, k;
    for (i = 0; i < 1000; i++) {
        for (j = 0; j < 1000; j++) {
            for (k = 0; k < 100; k++)
                ;
        }
    }
}

/* Reads back and prints the nine data-window registers (byte offsets 0..32). */
void show_reg() {
    int i;
    u32 result;
    printf("\n============SHOW REG ================\n");
    for (i = 0; i < 9; i++) {
        result = Xil_In32(XPAR_CONV_0_S00_AXI_BASEADDR + 4 * i);
        printf("Reg %3d : %u\n", i, result);
    }
}

/*
 * Writes the 3x3 kernel, row by row, to the nine kernel registers that
 * start at byte offset 36.
 */
void load_kernel(int filter[3][3]) {
    UINTPTR kernel_addr = (UINTPTR) XPAR_CONV_0_S00_AXI_BASEADDR + 36;
    int r, c;
    for (r = 0; r < 3; r++) {
        for (c = 0; c < 3; c++) {
            Xil_Out32(kernel_addr, filter[r][c]);
            kernel_addr = kernel_addr + 0x4;
        }
    }
}

/*
 * Pushes three columns of fixed test values through the sliding-window
 * registers (byte offsets 8, 20, 32) and dumps the registers after each
 * column.
 */
void test_set() {
    Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 8, 3);
    Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 20, 22);
    Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 32, 16);
    printf("1\n");
    show_reg();

    Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 8, 21);
    Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 20, 20);
    Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 32, 14);
    printf("2\n");
    show_reg();

    Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 8, 19);
    Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 20, 13);
    Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 32, 7);
    printf("3\n");
    show_reg();
}

/*
 * Software reference: for rows 2..arrH-1 and columns 2..arrW-1, stores in
 * res[i][j] the sum of filter[r][c] * arr[i - 1 + r][j - 1 + c] over the
 * 3x3 neighbourhood.
 */
void Conv_SW(int filter[3][3], int arr[ARR_DIM][ARR_DIM], int arrW, int arrH) {
    int i, j, r, c;
    for (i = 2; i < arrH; i++) {
        for (j = 2; j < arrW; j++) {
            res[i][j] = 0;
            for (r = 0; r < 3; r++) {
                for (c = 0; c < 3; c++) {
                    res[i][j] += filter[r][c] * arr[i - 1 + r][j - 1 + c];
                }
            }
        }
    }
}

/*
 * Hardware version using the sliding-window registers: per output, one new
 * column (three samples) is written to byte offsets 8, 20 and 32, and the
 * result is read from byte offset 72. `filter` is not referenced; the
 * kernel is loaded by load_kernel().
 *
 * Fix: the window is pre-loaded with columns 1 and 2 on every row. The
 * original indexed the pre-load with `j`, which was 2 only for the first
 * row and arrW for all later rows.
 */
void Conv_HW(int filter[3][3], int arr[ARR_DIM][ARR_DIM], int arrW, int arrH) {
    int i, j;
    for (i = 2; i < arrH; i++) {
        /* Pre-load columns 1 and 2 of rows i-1, i, i+1. */
        for (j = 1; j <= 2; j++) {
            Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 8, arr[i - 1][j]);
            Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 20, arr[i][j]);
            Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 32, arr[i + 1][j]);
        }
        /* Push column j+1, then read the output for (i, j). */
        for (j = 2; j < arrW; j++) {
            Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 8, arr[i - 1][j + 1]);
            Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 20, arr[i][j + 1]);
            Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 32, arr[i + 1][j + 1]);
            res[i][j] = Xil_In32(XPAR_CONV_0_S00_AXI_BASEADDR + 72);
        }
    }
}

/*
 * Clears the data registers, loads the kernel and runs whichever tests are
 * enabled through the test_single / test_func / test_speed macros.
 */
int main() {
    printf("HELLO WORLD");
    u32 result;
    int filterW = 3;
    int filterH = 3;
    int arrW = 5;
    int arrH = 5;
    int resW = filterW + arrW - 1;
    int resH = filterH + arrH - 1;
    int i, j;
    int pFilter[3][3] = {
        { 1, 3, 1 },
        { 0, 5, 0 },
        { 2, 1, 2 }
    };
    /*
     * Static so that it is zero-initialised and kept off the stack. The
     * original 100x100 stack array was indexed up to 500 by the speed test
     * and was never initialised there.
     */
    static int arr[ARR_DIM][ARR_DIM];
    UINTPTR cur_addr = (UINTPTR) XPAR_CONV_0_S00_AXI_BASEADDR;

    init_platform();

    /* Write 0 to the nine data registers. */
    for (i = 0; i < 9; i++) {
        Xil_Out32(cur_addr, 0);
        cur_addr = cur_addr + 4;
    }
    load_kernel(pFilter);
    printf("Kernel Loaded\n");

#ifdef test_single
    test_set();
    result = Xil_In32(XPAR_CONV_0_S00_AXI_BASEADDR + 72);
    printf("Test Set Result %u", result);
    show_reg();
#endif

#ifdef test_func
    srand(10);
    arrW = 20;
    arrH = 20;
    resH = filterH + arrH - 1;
    resW = filterW + arrW - 1;
    for (i = 0; i < arrH; i++) {
        for (j = 0; j < arrW; j++) {
            arr[i][j] = rand() % 20;
        }
    }
    printf("*********************************************** \n");
    printf("Filter: \n");
    for (i = filterH - 1; i >= 0; i--) {
        for (j = filterW - 1; j >= 0; j--) {
            printf("%d ", pFilter[i][j]);
        }
        printf("\n");
    }
    printf("*********************************************** \n");

    printf("Matrix: \n");
    for (i = 0; i < arrH; i++) {
        for (j = 0; j < arrW; j++) {
            printf("%4d ", arr[i][j]);
        }
        printf("\n");
    }
    printf("*********************************************** \n");
    printf("Software Start!\n");
    Conv_SW(pFilter, arr, arrW, arrH);
    printf("\nSoftware end!\n");

    printf("*********************************************** \n");
    printf("Result1: \n");
    for (i = 0; i < resH; i++) {
        for (j = 0; j < resW; j++) {
            printf("%5d ", res[i][j]);
        }
        printf("\n");
    }

    /* Clear the result area so the hardware run starts from zeros. */
    for (i = 0; i < resH; i++) {
        for (j = 0; j < resW; j++) {
            res[i][j] = 0;
        }
    }
    printf("*********************************************** \n");
    printf("HardWare Start!\n");
    Conv_HW(pFilter, arr, arrW, arrH);
    printf("\nHardWare end!");
    printf("Result2: \n");
    for (i = 0; i < resH; i++) {
        for (j = 0; j < resW; j++) {
            printf("%5d ", res[i][j]);
        }
        printf("\n");
    }
    printf("*********************************************** \n");
#endif

#ifdef test_speed
    /* 500x500 matrix, 200 passes each for software and hardware. */
    arrW = 500;
    arrH = 500;
    resH = filterH + arrH - 1;
    resW = filterW + arrW - 1;
    printf("Software Start!\n");

    for (i = 0; i < 200; i++) {
        Conv_SW(pFilter, arr, arrW, arrH);
    }
    printf("\nSoftware end!\n");
    printf("HardWare Start!\n");
    for (i = 0; i < 200; i++) {
        Conv_HW(pFilter, arr, arrW, arrH);
    }
    printf("\nHardWare end!");
#endif

    /* Paired with the unconditional init_platform() above. */
    cleanup_platform();
    return 0;
}

基于AXI4总线卷积FPGA加速IP核的尝试的更多相关文章

  1. 基于AMBA总线的SPI协议IP核的设计与验证

    https://wenku.baidu.com/view/9542213131126edb6f1a1048.html?mark_pay_doc=2&mark_rec_page=1&ma ...

  2. 自定义AXI总线形式SPI接口IP核,点亮OLED

    一.前言 最近花费很多精力在算法仿真和实现上,外设接口的调试略有生疏.本文以FPGA控制OLED中的SPI接口为例,重新夯实下基础.重点内容为SPI时序的RTL设计以及AXI-Lite总线分析.当然做 ...

  3. FPGA内部IP核DDS

    项目当中需要正弦信号与余弦信号,首先想到了DDS芯片,例如AD9833.AD9834.由于还需要用FPGA   做一些数据处理,后来干脆直接用FPGA 内部的DDSIP核,同时根据IP核内部的相位累加 ...

  4. 一步一步学ZedBoard & Zynq(四):基于AXI Lite 总线的从设备IP设计

    本帖最后由 xinxincaijq 于 2013-1-9 10:27 编辑 一步一步学ZedBoard & Zynq(四):基于AXI Lite 总线的从设备IP设计 转自博客:http:// ...

  5. AXI-Lite总线及其自定义IP核使用分析总结

    ZYNQ的优势在于通过高效的接口总线组成了ARM+FPGA的架构.我认为两者是互为底层的,当进行算法验证时,ARM端现有的硬件控制器和库函数可以很方便地连接外设,而不像FPGA设计那样完全写出接口时序 ...

  6. 基于MIG IP核的DDR3控制器(一)

    最近学习了DDR3控制器的使用,也用着DDR完成了一些简单工作,想着以后一段可能只用封装过后的IP核,可能会忘记DDR3控制器的一些内容,想着把这个DDR控制器的编写过程记录下来,便于我自己以后查看吧 ...

  7. 每天进步一点点------SOPC的Avalon-MM IP核(二) AVALON总线的IP核定制

    简介 NIOS II是一个建立在FPGA上的嵌入式软核处理器,除了可以根据需要任意添加已经提供的外设外,用户还可以通过定制用户逻辑外设和定制用户指令来实现各种应用要求.这节我们就来研究如何定制基于Av ...

  8. Vivado设计二:zynq的PS访问PL中的自带IP核(基于zybo)

    1.建立工程 首先和Vivado设计一中一样,先建立工程(这部分就忽略了) 2.create block design 同样,Add IP 同样,也添加配置文件,这些都和设计一是一样的,没什么区别. ...

  9. 【6集iCore3_ADP触摸屏驱动讲解视频】6-2 基于FSMC总线的ARM与FPGA通信

    视频简介: 该视频介绍基于FSMC总线的ARM与FPGA通信   源视频包下载地址: 链接:http://pan.baidu.com/s/1slJDoQD 密码:tmw7   银杏科技优酷视频发布区: ...

随机推荐

  1. 消息队列中间件 RocketMQ 源码分析 —— Message 存储

  2. 用u盘启动计算机

    上次只是做好了u盘启动盘,但是并没有说怎么安装系统.接下来说一下怎么装系统.链接:怎么把系统装进u盘(ultraiso) 电脑经常要用到u盘启动.设置u盘启动在bios设置里面进行设置.下面就来讲解一 ...

  3. 写一个Vue loading 插件

    什么是vue插件? 从功能上说,插件是为Vue添加全局功能的一种机制,比如给Vue添加一个全局组件,全局指令等: 从代码结构上说,插件就是一个必须拥有install方法的对象,这个方法的接收的第一个参 ...

  4. [js高手之路] vue系列教程 - 实现留言板todolist(3)

    通过前面两篇文章的的学习,我们掌握了vue的基本用法. 本文,就利用这些基础知识来实现一个留言板, 老外把他称之为todolist. 第一步.使用bootstrap做好布局 <!DOCTYPE ...

  5. 关于java中用itext导出word的一点想法

    这几天在项目组只做了很少的事情,主要还是自己不认真地说.我的部分是要负责用itext导出word文档这一块,之前看到大佬们做出了EXCEL部分觉得很是惊奇,就像刚刚接触HTML一样的感觉.但是毕竟自己 ...

  6. Go指针

    Go 语言指针 Go 语言中指针是很容易学习的,Go 语言中使用指针可以更简单的执行一些任务. 接下来让我们来一步步学习 Go 语言指针. 我们都知道,变量是一种使用方便的占位符,用于引用计算机内存地 ...

  7. 一个基于JRTPLIB的轻量级RTSP客户端(myRTSPClient)——解码篇:(一)用ffmpeg解码视频

    一.概述 myRTSPClient(RTSPClient)获取音视频数据之后,接下来的工作便是将音视频数据交给解码器去解码(ffmpeg),ffmpeg解码之后于是便有了呈现在终端用户(USER)面前 ...

  8. Android开发之SoundPool使用详解

    使用SoundPool播放音效 如果应用程序经常播放密集.急促而又短暂的音效(如游戏音效)那么使用MediaPlayer显得有些不太适合了.因为MediaPlayer存在如下缺点: 1) 延时时间较长 ...

  9. Andrew Ng机器学习课程笔记--week10(优化梯度下降)

    本周主要介绍了梯度下降算法运用到大数据时的优化方法. 一.内容概要 Gradient Descent with Large Datasets Stochastic Gradient Descent M ...

  10. 关于变量 Objects...objects 和Object[] objects的区别

    上一篇用到Objects...objects 和Object[] objects的遇到点小问题,于是我去做了个实验,关于这两个变量传参的问题 代码如下 package com.yck.test; pu ...