本文先总结不同AXI IP核的实现的方法,性能的对比,性能差异的分析,可能改进的方面。使用的硬件平台是Zedboard

不同的AXI总线卷积加速模块的概况

这次实现并逐渐优化了三个版本的卷积加速模块,先简要描述各个版本的主要内容。

版本一

版本一主要是用来测试AXI总线IP核的实现可能。

  • 该模块拥有19个32位寄存器
  • 其中前9个寄存器用来保存需要计算的值
  • 后面9个寄存器用来保存卷积核
  • 在读取第19个寄存器的地址的时候计算9个寄存器的卷积和(该计算可以在一个时钟周期内完成)
  • 9个寄存器单独赋值,程序中分别向对应地址写入内容,通过总线进行传输。
  • 故乐观的来算,需要10个总线周期可以获取一个输出

可以从驱动的书写简单理解一下:

void Conv_HW(int filter[3][3], int arr[100][100],
int filterW, int filterH, int arrW, int arrH) {
int i, j;
for (i = 2; i < filterH + arrH - 3; i++) {
for (j = 2; j < filterW + arrW - 3; j++) {
Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR, arr[i][j]);
Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR+4, arr[i][j - 1]);
Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR+8, arr[i][j - 2]);
Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR+12, arr[i - 1][j]);
Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR+16, arr[i - 1][j - 1]);
Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR+20, arr[i - 1][j - 2]);
Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR+24, arr[i - 2][j]);
Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR+28, arr[i - 2][j - 1]);
Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR+32, arr[i - 2][j - 2]);
res[i][j] = Xil_In32(XPAR_CONV_0_S00_AXI_BASEADDR + 72);
}
if (i % 15 == 0)
printf("=");
}
}

版本一性能

  • 版本一性能最惨,由于没有时间戳,目测软件计算速度远远快于FPGA核心运算速度。
  • 版本一的改进速度就是引入滑动窗口,能够最大程度减少总线周期。

版本二

版本二引入滑动窗口,和初期设计的概念相同。

  • 该模块拥有19个32位寄存器

  • 其中前9个寄存器用来保存需要计算的值

  • 后面9个寄存器用来保存卷积核

  • 在读取第19个寄存器的地址的时候计算9个寄存器的卷积和(该计算可以在一个时钟周期内完成)

  • 三个寄存器滑动赋值,该计算窗口在计算矩阵上滑动 除了冷启动多余两个周期用来预载寄存器,后面的每一个计算只需要四个总线周期

    可以通过写的驱动简单理解一下:

    void Conv_HW(int filter[3][3], int arr[100][100], int arrW, int arrH) {
    int i, j;
    i = 2; j = 2;
    for (i = 2; i < arrH; i++) {
    //pre load
    Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 8, arr[i - 1][j - 1]);
    Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 20, arr[i][j - 1]);
    Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 32, arr[i + 1][j - 1]);
    Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 8, arr[i - 1][j]);
    Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 20, arr[i][j]);
    Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 32, arr[i + 1][j]);
    for (j = 2; j < arrW; j++) {
    Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 8, arr[i - 1][j + 1]);
    Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 20, arr[i][j + 1]);
    Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 32, arr[i + 1][j + 1]);
    res[i][j] = Xil_In32(XPAR_CONV_0_S00_AXI_BASEADDR + 72);
    }
    }
    }

版本二性能

测试样本 500*50032bit单位的矩阵 计算200次。

软件消耗33.78秒,卷积IP核心40.25

这样的结果还是非常不乐观,分析可能有两种限制了IP核的速度。

  • 两个寄存器的乘法LUT太大,无法硬件优化
  • 总线周期太慢太慢

版本三对于这两种可能进行探索。

版本二的FPGA部分核心代码

    // Implement memory mapped register select and write logic generation
// The write data is accepted and written to memory mapped registers when
// axi_awready, S_AXI_WVALID, axi_wready and S_AXI_WVALID are asserted. Write strobes are used to
// select byte enables of slave registers while writing.
// These registers are cleared when reset (active low) is applied.
// Slave register write enable is asserted when valid address and data are available
// and the slave is ready to accept the write address and write data.
assign slv_reg_wren = axi_wready && S_AXI_WVALID && axi_awready && S_AXI_AWVALID; always @( posedge S_AXI_ACLK )
begin
if ( S_AXI_ARESETN == 1'b0 )
begin
slv_reg0 <= 0;
slv_reg1 <= 0;
slv_reg2 <= 0;
slv_reg3 <= 0;
slv_reg4 <= 0;
slv_reg5 <= 0;
slv_reg6 <= 0;
slv_reg7 <= 0;
slv_reg8 <= 0;
slv_reg9 <= 0;
slv_reg10 <= 0;
slv_reg11 <= 0;
slv_reg12 <= 0;
slv_reg13 <= 0;
slv_reg14 <= 0;
slv_reg15 <= 0;
slv_reg16 <= 0;
slv_reg17 <= 0;
// slv_reg18 <= 0;
end
else begin
if (slv_reg_wren)
begin
case ( axi_awaddr[ADDR_LSB+OPT_MEM_ADDR_BITS:ADDR_LSB] )
5'h00:
for ( byte_index = 0; byte_index <= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )
if ( S_AXI_WSTRB[byte_index] == 1 ) begin
// Respective byte enables are asserted as per write strobes
// Slave register 0
slv_reg0[(byte_index*8) +: 8] <= S_AXI_WDATA[(byte_index*8) +: 8];
end
5'h01:
for ( byte_index = 0; byte_index <= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )
if ( S_AXI_WSTRB[byte_index] == 1 ) begin
// Respective byte enables are asserted as per write strobes
// Slave register 1
slv_reg1[(byte_index*8) +: 8] <= S_AXI_WDATA[(byte_index*8) +: 8];
end
5'h02:
begin
slv_reg0 <= slv_reg1;
slv_reg1 <= slv_reg2;
for ( byte_index = 0; byte_index <= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )
if ( S_AXI_WSTRB[byte_index] == 1 ) begin
// Respective byte enables are asserted as per write strobes
// Slave register 2
slv_reg2[(byte_index*8) +: 8] <= S_AXI_WDATA[(byte_index*8) +: 8];
end
end
5'h03:
for ( byte_index = 0; byte_index <= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )
if ( S_AXI_WSTRB[byte_index] == 1 ) begin
// Respective byte enables are asserted as per write strobes
// Slave register 3
slv_reg3[(byte_index*8) +: 8] <= S_AXI_WDATA[(byte_index*8) +: 8];
end
5'h04:
for ( byte_index = 0; byte_index <= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )
if ( S_AXI_WSTRB[byte_index] == 1 ) begin
// Respective byte enables are asserted as per write strobes
// Slave register 4
slv_reg4[(byte_index*8) +: 8] <= S_AXI_WDATA[(byte_index*8) +: 8];
end
5'h05:
begin
slv_reg3 <= slv_reg4;
slv_reg4 <= slv_reg5;
for ( byte_index = 0; byte_index <= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )
if ( S_AXI_WSTRB[byte_index] == 1 ) begin
// Respective byte enables are asserted as per write strobes
// Slave register 5
slv_reg5[(byte_index*8) +: 8] <= S_AXI_WDATA[(byte_index*8) +: 8];
end
end
5'h06:
for ( byte_index = 0; byte_index <= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )
if ( S_AXI_WSTRB[byte_index] == 1 ) begin
// Respective byte enables are asserted as per write strobes
// Slave register 6
slv_reg6[(byte_index*8) +: 8] <= S_AXI_WDATA[(byte_index*8) +: 8];
end
5'h07:
for ( byte_index = 0; byte_index <= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )
if ( S_AXI_WSTRB[byte_index] == 1 ) begin
// Respective byte enables are asserted as per write strobes
// Slave register 7
slv_reg7[(byte_index*8) +: 8] <= S_AXI_WDATA[(byte_index*8) +: 8];
end
5'h08:
begin
slv_reg6 <= slv_reg7;
slv_reg7 <= slv_reg8;
for ( byte_index = 0; byte_index <= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )
if ( S_AXI_WSTRB[byte_index] == 1 ) begin
// Respective byte enables are asserted as per write strobes
// Slave register 8
slv_reg8[(byte_index*8) +: 8] <= S_AXI_WDATA[(byte_index*8) +: 8];
end
end
5'h09:
for ( byte_index = 0; byte_index <= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )
if ( S_AXI_WSTRB[byte_index] == 1 ) begin
// Respective byte enables are asserted as per write strobes
// Slave register 9
slv_reg9[(byte_index*8) +: 8] <= S_AXI_WDATA[(byte_index*8) +: 8];
end
5'h0A:
for ( byte_index = 0; byte_index <= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )
if ( S_AXI_WSTRB[byte_index] == 1 ) begin
// Respective byte enables are asserted as per write strobes
// Slave register 10
slv_reg10[(byte_index*8) +: 8] <= S_AXI_WDATA[(byte_index*8) +: 8];
end
5'h0B:
for ( byte_index = 0; byte_index <= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )
if ( S_AXI_WSTRB[byte_index] == 1 ) begin
// Respective byte enables are asserted as per write strobes
// Slave register 11
slv_reg11[(byte_index*8) +: 8] <= S_AXI_WDATA[(byte_index*8) +: 8];
end
5'h0C:
for ( byte_index = 0; byte_index <= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )
if ( S_AXI_WSTRB[byte_index] == 1 ) begin
// Respective byte enables are asserted as per write strobes
// Slave register 12
slv_reg12[(byte_index*8) +: 8] <= S_AXI_WDATA[(byte_index*8) +: 8];
end
5'h0D:
for ( byte_index = 0; byte_index <= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )
if ( S_AXI_WSTRB[byte_index] == 1 ) begin
// Respective byte enables are asserted as per write strobes
// Slave register 13
slv_reg13[(byte_index*8) +: 8] <= S_AXI_WDATA[(byte_index*8) +: 8];
end
5'h0E:
for ( byte_index = 0; byte_index <= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )
if ( S_AXI_WSTRB[byte_index] == 1 ) begin
// Respective byte enables are asserted as per write strobes
// Slave register 14
slv_reg14[(byte_index*8) +: 8] <= S_AXI_WDATA[(byte_index*8) +: 8];
end
5'h0F:
for ( byte_index = 0; byte_index <= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )
if ( S_AXI_WSTRB[byte_index] == 1 ) begin
// Respective byte enables are asserted as per write strobes
// Slave register 15
slv_reg15[(byte_index*8) +: 8] <= S_AXI_WDATA[(byte_index*8) +: 8];
end
5'h10:
for ( byte_index = 0; byte_index <= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )
if ( S_AXI_WSTRB[byte_index] == 1 ) begin
// Respective byte enables are asserted as per write strobes
// Slave register 16
slv_reg16[(byte_index*8) +: 8] <= S_AXI_WDATA[(byte_index*8) +: 8];
end
5'h11:
for ( byte_index = 0; byte_index <= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )
if ( S_AXI_WSTRB[byte_index] == 1 ) begin
// Respective byte enables are asserted as per write strobes
// Slave register 17
slv_reg17[(byte_index*8) +: 8] <= S_AXI_WDATA[(byte_index*8) +: 8];
end
// 5'h12:
// for ( byte_index = 0; byte_index <= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )
// if ( S_AXI_WSTRB[byte_index] == 1 ) begin
// // Respective byte enables are asserted as per write strobes
// // Slave register 18
// slv_reg18[(byte_index*8) +: 8] <= S_AXI_WDATA[(byte_index*8) +: 8];
// end
default : begin
slv_reg0 <= slv_reg0;
slv_reg1 <= slv_reg1;
slv_reg2 <= slv_reg2;
slv_reg3 <= slv_reg3;
slv_reg4 <= slv_reg4;
slv_reg5 <= slv_reg5;
slv_reg6 <= slv_reg6;
slv_reg7 <= slv_reg7;
slv_reg8 <= slv_reg8;
slv_reg9 <= slv_reg9;
slv_reg10 <= slv_reg10;
slv_reg11 <= slv_reg11;
slv_reg12 <= slv_reg12;
slv_reg13 <= slv_reg13;
slv_reg14 <= slv_reg14;
slv_reg15 <= slv_reg15;
slv_reg16 <= slv_reg16;
slv_reg17 <= slv_reg17;
end
endcase
end
end
end // Implement memory mapped register select and read logic generation
// Slave register read enable is asserted when valid address is available
// and the slave is ready to accept the read address.
assign slv_reg_rden = axi_arready & S_AXI_ARVALID & ~axi_rvalid;
always @(*)
begin
// Address decoding for reading registers
case ( axi_araddr[ADDR_LSB+OPT_MEM_ADDR_BITS:ADDR_LSB] )
5'h00 : reg_data_out <= slv_reg0;
5'h01 : reg_data_out <= slv_reg1;
5'h02 : reg_data_out <= slv_reg2;
5'h03 : reg_data_out <= slv_reg3;
5'h04 : reg_data_out <= slv_reg4;
5'h05 : reg_data_out <= slv_reg5;
5'h06 : reg_data_out <= slv_reg6;
5'h07 : reg_data_out <= slv_reg7;
5'h08 : reg_data_out <= slv_reg8;
5'h09 : reg_data_out <= slv_reg9;
5'h0A : reg_data_out <= slv_reg10;
5'h0B : reg_data_out <= slv_reg11;
5'h0C : reg_data_out <= slv_reg12;
5'h0D : reg_data_out <= slv_reg13;
5'h0E : reg_data_out <= slv_reg14;
5'h0F : reg_data_out <= slv_reg15;
5'h10 : reg_data_out <= slv_reg16;
5'h11 : reg_data_out <= slv_reg17;
5'h12 : reg_data_out <= slv_reg0 * slv_reg9 +
slv_reg1 * slv_reg10 +
slv_reg2 * slv_reg11 +
slv_reg3 * slv_reg12 +
slv_reg4 * slv_reg13 +
slv_reg5 * slv_reg14 +
slv_reg6 * slv_reg15 +
slv_reg7 * slv_reg16 +
slv_reg8 * slv_reg17;
default : reg_data_out <= 0;
endcase
end

版本三

先尝试生成更小的LUT

  • 该模块拥有19个32位寄存器
  • 其中前9个寄存器用来保存需要计算的值
  • 卷积核固定在Verilog中,用来生成更小的LUT
  • 一个计算只需要四个总线周期

性能测试

仍然软件消耗33秒,卷积IP核心40

基本否决是LUT问题。

下面测试AXI总线问题:

假设所有数据均来自于FPGA,无需从总线写入:

void Conv_HW(int filter[3][3], int arr[100][100], int arrW, int arrH) {
int i, j;
i = 2; j = 2;
for (i = 2; i < arrH; i++) {
for (j = 2; j < arrW; j++) {
res[i][j] = Xil_In32(XPAR_CONV_0_S00_AXI_BASEADDR + 72);
}
}
}

只需要9.47秒即可完成计算,并传回CPU !!!

总结

至此,基本上可以否决利用AXI传数据的可能,所有需要利用AXI总线传输数据的模块均会被总线周期所连累,在优化了传输后,仍然无法解决该问题。确实需要一个更快的方式来传输数据。

AlteraNIOS2中,直接利用IO口传输数据,无需总线周期,再因为NIOS II内核没有流水线优化,所以硬件确实比较快。

附1:AXI4 总线的 FPGA 接口部分

先看总线接口:

        // Users to add ports here

        // User ports ends
// Do not modify the ports beyond this line // Global Clock Signal
// 全局时钟
input wire S_AXI_ACLK,
// Global Reset Signal. This Signal is Active LOW
// 全局复位信号
input wire S_AXI_ARESETN,
// Write address (issued by master, acceped by Slave)
// 写地址
input wire [C_S_AXI_ADDR_WIDTH-1 : 0] S_AXI_AWADDR, // 写地址的保护模式 包括privilege和security level
// Write channel Protection type. This signal indicates the
// privilege and security level of the transaction, and whether
// the transaction is a data access or an instruction access.
input wire [2 : 0] S_AXI_AWPROT, // 写地址有效信号。为高指示地址有效。
// Write address valid. This signal indicates that the master signaling
// valid write address and control information.
input wire S_AXI_AWVALID, // 写地址准备信号。为高表示从设备空闲,准备接收地址;为低表示从设备忙。
// ********** 注意 这里是地址 下面是数据 ********
// Write address ready. This signal indicates that the slave is ready
// to accept an address and associated control signals.
output wire S_AXI_AWREADY, // 写数据,32位到1024位宽
// 从主设备来的数据 从设备接收
// Write data (issued by master, acceped by Slave)
input wire [C_S_AXI_DATA_WIDTH-1 : 0] S_AXI_WDATA, // 写字节选通,用于表示更新存储器的字节通道,对于数据总线的每8位数据有一位写选通信号。
// Write strobes. This signal indicates which byte lanes hold
// valid data. There is one write strobe bit for each eight
// bits of the write data bus.
input wire [(C_S_AXI_DATA_WIDTH/8)-1 : 0] S_AXI_WSTRB, // 写有效。为高指示数据有效。
// Write valid. This signal indicates that valid write
// data and strobes are available.
input wire S_AXI_WVALID, // 写准备。为高表示从设备空闲,准备接收数据;为低表示从设备忙。
// Write ready. This signal indicates that the slave
// can accept the write data.
output wire S_AXI_WREADY, // 写响应。该信号表示写状态,可允许相应的表示为OKAY\EXOKAY\SLVERR\DECERR。
// Write response. This signal indicates the status
// of the write transaction.
output wire [1 : 0] S_AXI_BRESP, // 写响应有效。为高指示响应数据有效
// Write response valid. This signal indicates that the channel
// is signaling a valid write response.
output wire S_AXI_BVALID, // 写响应准备。为高表示主设备空闲,准备接收写响应;为低表示主设备忙。
// Response ready. This signal indicates that the master
// can accept a write response.
input wire S_AXI_BREADY, //
// 读地址。读地址给出突发数据传输的第一个传输地址。
// Read address (issued by master, acceped by Slave)
input wire [C_S_AXI_ADDR_WIDTH-1 : 0] S_AXI_ARADDR, // 保护类型,建议值为000。
// Protection type. This signal indicates the privilege
// and security level of the transaction, and whether the
// transaction is a data access or an instruction access.
input wire [2 : 0] S_AXI_ARPROT, //
// Read address valid. This signal indicates that the channel
// is signaling valid read address and control information.
input wire S_AXI_ARVALID, // 读地址准备信号。为高表示从设备空闲,准备接收地址;为低表示从设备忙。
// Read address ready. This signal indicates that the slave is
// ready to accept an address and associated control signals.
output wire S_AXI_ARREADY, // Read data (issued by slave)
output wire [C_S_AXI_DATA_WIDTH-1 : 0] S_AXI_RDATA,
// Read response. This signal indicates the status of the
// read transfer.
output wire [1 : 0] S_AXI_RRESP,
// Read valid. This signal indicates that the channel is
// signaling the required read data.
output wire S_AXI_RVALID,
// Read ready. This signal indicates that the master can
// accept the read data and response information.
input wire S_AXI_RREADY
); // AXI4LITE signals
reg [C_S_AXI_ADDR_WIDTH-1 : 0] axi_awaddr;
reg axi_awready;
reg axi_wready;
reg [1 : 0] axi_bresp;
reg axi_bvalid;
reg [C_S_AXI_ADDR_WIDTH-1 : 0] axi_araddr;
reg axi_arready;
reg [C_S_AXI_DATA_WIDTH-1 : 0] axi_rdata;
reg [1 : 0] axi_rresp;
reg axi_rvalid;

其中最为重要的读取总线信号寻址的部分:

assign slv_reg_wren = axi_wready && S_AXI_WVALID && axi_awready && S_AXI_AWVALID;

    always @( posedge S_AXI_ACLK )
begin
if ( S_AXI_ARESETN == 1'b0 )
begin
slv_reg0 <= 0;
slv_reg1 <= 0;
slv_reg2 <= 0;
slv_reg3 <= 0;
slv_reg4 <= 0;
slv_reg5 <= 0;
slv_reg6 <= 0;
slv_reg7 <= 0;
slv_reg8 <= 0;
slv_reg9 <= 0;
end
else begin
if (slv_reg_wren)
begin
// 进行寻址
// 地址寻址 是这么玩的
// 当寄存器是32位的 最后就是 2位 4个Byte ADDR_LSB = 2
// 当寄存器是64位的 最后就是 3位 8个Byte ADDR_LSB = 3
// OPT_MEM_ADDR_BITS 用来寻址寄存器 这里选了十个寄存器 所以这里就是4位
case ( axi_awaddr[ADDR_LSB+OPT_MEM_ADDR_BITS:ADDR_LSB] )
4'h0:
for ( byte_index = 0; byte_index <= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )
// 只有在对应的Bit位置为1的时候才能开始读取
if ( S_AXI_WSTRB[byte_index] == 1 ) begin
// Respective byte enables are asserted as per write strobes
// Slave register 0
slv_reg0[(byte_index*8) +: 8] <= S_AXI_WDATA[(byte_index*8) +: 8];
end
4'h1:
for ( byte_index = 0; byte_index <= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )
if ( S_AXI_WSTRB[byte_index] == 1 ) begin
// Respective byte enables are asserted as per write strobes
// Slave register 1
slv_reg1[(byte_index*8) +: 8] <= S_AXI_WDATA[(byte_index*8) +: 8];
end
4'h2:
for ( byte_index = 0; byte_index <= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )
if ( S_AXI_WSTRB[byte_index] == 1 ) begin
// Respective byte enables are asserted as per write strobes
// Slave register 2
slv_reg2[(byte_index*8) +: 8] <= S_AXI_WDATA[(byte_index*8) +: 8];
end
4'h3:
for ( byte_index = 0; byte_index <= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )
if ( S_AXI_WSTRB[byte_index] == 1 ) begin
// Respective byte enables are asserted as per write strobes
// Slave register 3
slv_reg3[(byte_index*8) +: 8] <= S_AXI_WDATA[(byte_index*8) +: 8];
end
4'h4:
for ( byte_index = 0; byte_index <= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )
if ( S_AXI_WSTRB[byte_index] == 1 ) begin
// Respective byte enables are asserted as per write strobes
// Slave register 4
slv_reg4[(byte_index*8) +: 8] <= S_AXI_WDATA[(byte_index*8) +: 8];
end
4'h5:
for ( byte_index = 0; byte_index <= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )
if ( S_AXI_WSTRB[byte_index] == 1 ) begin
// Respective byte enables are asserted as per write strobes
// Slave register 5
slv_reg5[(byte_index*8) +: 8] <= S_AXI_WDATA[(byte_index*8) +: 8];
end
4'h6:
for ( byte_index = 0; byte_index <= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )
if ( S_AXI_WSTRB[byte_index] == 1 ) begin
// Respective byte enables are asserted as per write strobes
// Slave register 6
slv_reg6[(byte_index*8) +: 8] <= S_AXI_WDATA[(byte_index*8) +: 8];
end
4'h7:
for ( byte_index = 0; byte_index <= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )
if ( S_AXI_WSTRB[byte_index] == 1 ) begin
// Respective byte enables are asserted as per write strobes
// Slave register 7
slv_reg7[(byte_index*8) +: 8] <= S_AXI_WDATA[(byte_index*8) +: 8];
end
4'h8:
for ( byte_index = 0; byte_index <= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )
if ( S_AXI_WSTRB[byte_index] == 1 ) begin
// Respective byte enables are asserted as per write strobes
// Slave register 8
slv_reg8[(byte_index*8) +: 8] <= S_AXI_WDATA[(byte_index*8) +: 8];
end
4'h9:
for ( byte_index = 0; byte_index <= (C_S_AXI_DATA_WIDTH/8)-1; byte_index = byte_index+1 )
if ( S_AXI_WSTRB[byte_index] == 1 ) begin
// Respective byte enables are asserted as per write strobes
// Slave register 9
slv_reg9[(byte_index*8) +: 8] <= S_AXI_WDATA[(byte_index*8) +: 8];
end
default : begin
slv_reg0 <= slv_reg0;
slv_reg1 <= slv_reg1;
slv_reg2 <= slv_reg2;
slv_reg3 <= slv_reg3;
slv_reg4 <= slv_reg4;
slv_reg5 <= slv_reg5;
slv_reg6 <= slv_reg6;
slv_reg7 <= slv_reg7;
slv_reg8 <= slv_reg8;
slv_reg9 <= slv_reg9;
end
endcase
end
end
end

附2:AXI4的测试模块与仿真测试

`timescale 1ns/1ns
module conv_axi_test();
parameter integer C_S00_AXI_DATA_WIDTH = 32;
parameter integer C_S00_AXI_ADDR_WIDTH = 6;
reg s00_axi_aclk;
// 全局复位信号
reg s00_axi_aresetn;
reg [C_S00_AXI_ADDR_WIDTH-1 : 0] s00_axi_awaddr;
wire [2 : 0] s00_axi_awprot;
reg s00_axi_awvalid;
wire s00_axi_awready;
reg [C_S00_AXI_DATA_WIDTH-1 : 0] s00_axi_wdata;
reg [(C_S00_AXI_DATA_WIDTH/8)-1 : 0] s00_axi_wstrb;
reg s00_axi_wvalid;
wire s00_axi_wready;
wire [1 : 0] s00_axi_bresp;
wire s00_axi_bvalid;
wire s00_axi_bready;
reg [C_S00_AXI_ADDR_WIDTH-1 : 0] s00_axi_araddr;
wire [2 : 0] s00_axi_arprot;
reg s00_axi_arvalid;
wire s00_axi_arready;
wire [C_S00_AXI_DATA_WIDTH-1 : 0] s00_axi_rdata;
wire [1 : 0] s00_axi_rresp;
wire s00_axi_rvalid;
wire s00_axi_rready; conv_v1_0_S00_AXI # (
.C_S_AXI_DATA_WIDTH(C_S00_AXI_DATA_WIDTH),
.C_S_AXI_ADDR_WIDTH(C_S00_AXI_ADDR_WIDTH)
) conv_v1_0_S00_AXI_inst (
.S_AXI_ACLK(s00_axi_aclk),
.S_AXI_ARESETN(s00_axi_aresetn),
.S_AXI_AWADDR(s00_axi_awaddr),
.S_AXI_AWPROT(s00_axi_awprot),
.S_AXI_AWVALID(s00_axi_awvalid),
.S_AXI_AWREADY(s00_axi_awready),
.S_AXI_WDATA(s00_axi_wdata),
.S_AXI_WSTRB(s00_axi_wstrb),
.S_AXI_WVALID(s00_axi_wvalid),
.S_AXI_WREADY(s00_axi_wready),
.S_AXI_BRESP(s00_axi_bresp),
.S_AXI_BVALID(s00_axi_bvalid),
.S_AXI_BREADY(s00_axi_bready),
.S_AXI_ARADDR(s00_axi_araddr),
.S_AXI_ARPROT(s00_axi_arprot),
.S_AXI_ARVALID(s00_axi_arvalid),
.S_AXI_ARREADY(s00_axi_arready),
.S_AXI_RDATA(s00_axi_rdata),
.S_AXI_RRESP(s00_axi_rresp),
.S_AXI_RVALID(s00_axi_rvalid),
.S_AXI_RREADY(s00_axi_rready)
); initial
begin:d
integer i;
s00_axi_aclk = 1;
for(i = 0; i< 1000;i++)
begin
#1 s00_axi_aclk = ~ s00_axi_aclk;
end
$finish();
end initial
begin
s00_axi_aresetn = 0;
s00_axi_arvalid = 0;
#4 s00_axi_aresetn = 1;
s00_axi_awvalid = 1;
s00_axi_wvalid = 1;
s00_axi_awaddr = 0;
s00_axi_wstrb = 4'b1111;
s00_axi_wdata = 3;
#4 s00_axi_awaddr = 6'b000100;
s00_axi_wdata = 21;
#4 s00_axi_awaddr = 6'b001000;
s00_axi_wdata = 19;
#4 s00_axi_awaddr = 6'b001100;
s00_axi_wdata = 22;
#4 s00_axi_awaddr = 6'b010000;
s00_axi_wdata = 20;
#4 s00_axi_awaddr = 6'b010100;
s00_axi_wdata = 13;
#4 s00_axi_awaddr = 6'b011000;
s00_axi_wdata = 16;
#4 s00_axi_awaddr = 6'b011100;
s00_axi_wdata = 14;
#4 s00_axi_awaddr = 6'b100000;
s00_axi_wdata = 7; #4
s00_axi_arvalid = 1;
s00_axi_araddr = 6'b100100; end initial
begin
$dumpfile("test.vcd");
$dumpvars();
end
endmodule

利用iverilog进行仿真GTKwave显示测试波形如下

新建IP核如下:

工程顶层图如下:

附3:软件驱动

#include <stdio.h>
#include "platform.h"
#include "xbasic_types.h"
#include "xparameters.h"
#include "xil_io.h"
#define test_speed int res[1000][1000]; void delay() {
int i, j, k;
for (i = 0; i < 1000; i++) {
for (j = 0; j < 1000; j++) {
for (k = 0; k < 100; k++)
;
}
}
} void show_reg() {
int i;
u32 result;
printf("\n============SHOW REG ================\n");
for (i = 0; i < 9; i++) {
result = Xil_In32(XPAR_CONV_0_S00_AXI_BASEADDR + 4 * i);
printf("Reg %3d : %u\n", i, result);
}
} void load_kernel(int filter[3][3]) {
UINTPTR kernel_addr = (UINTPTR) XPAR_CONV_0_S00_AXI_BASEADDR + 36;
Xil_Out32(kernel_addr, filter[0][0]);
kernel_addr = kernel_addr + 0x4;
Xil_Out32(kernel_addr, filter[0][1]);
kernel_addr = kernel_addr + 0x4;
Xil_Out32(kernel_addr, filter[0][2]);
kernel_addr = kernel_addr + 0x4;
Xil_Out32(kernel_addr, filter[1][0]);
kernel_addr = kernel_addr + 0x4;
Xil_Out32(kernel_addr, filter[1][1]);
kernel_addr = kernel_addr + 0x4;
Xil_Out32(kernel_addr, filter[1][2]);
kernel_addr = kernel_addr + 0x4;
Xil_Out32(kernel_addr, filter[2][0]);
kernel_addr = kernel_addr + 0x4;
Xil_Out32(kernel_addr, filter[2][1]);
kernel_addr = kernel_addr + 0x4;
Xil_Out32(kernel_addr, filter[2][2]);
} void test_set() {
Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 8, 3);
Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 20, 22);
Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 32, 16);
printf("1\n");
show_reg(); Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 8, 21);
Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 20, 20);
Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 32, 14);
printf("2\n");
show_reg(); Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 8, 19);
Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 20, 13);
Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 32, 7);
printf("3\n");
show_reg();
} void Conv_SW(int filter[3][3], int arr[100][100], int arrW, int arrH) {
int i, j;
i = 2; j = 2;
for (i = 2; i < arrH; i++) {
for (j = 2; j < arrW;j++){
res[i][j] = 0;
res[i][j] += filter[0][0] * arr[i - 1][j - 1];
res[i][j] += filter[0][1] * arr[i - 1][j];
res[i][j] += filter[0][2] * arr[i - 1][j + 1];
res[i][j] += filter[1][0] * arr[i][j - 1];
res[i][j] += filter[1][1] * arr[i][j];
res[i][j] += filter[1][2] * arr[i][j + 1];
res[i][j] += filter[2][0] * arr[i + 1][j - 1];
res[i][j] += filter[2][1] * arr[i + 1][j];
res[i][j] += filter[2][2] * arr[i + 1][j + 1];
}
}
} void Conv_HW(int filter[3][3], int arr[100][100], int arrW, int arrH) {
int i, j;
i = 2; j = 2;
for (i = 2; i < arrH; i++) {
//pre load
Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 8, arr[i - 1][j - 1]);
Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 20, arr[i][j - 1]);
Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 32, arr[i + 1][j - 1]);
Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 8, arr[i - 1][j]);
Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 20, arr[i][j]);
Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 32, arr[i + 1][j]);
for (j = 2; j < arrW; j++) {
Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 8, arr[i - 1][j + 1]);
Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 20, arr[i][j + 1]);
Xil_Out32(XPAR_CONV_0_S00_AXI_BASEADDR + 32, arr[i + 1][j + 1]);
res[i][j] = Xil_In32(XPAR_CONV_0_S00_AXI_BASEADDR + 72);
}
}
} int main() {
printf("HELLO WORLD");
u32 result;
int filterW = 3;
int filterH = 3;
int arrW = 5;
int arrH = 5;
int resW = filterW + arrW - 1;
int resH = filterH + arrH - 1;
int i, j;
int pFilter[3][3];
int arr[100][100];
UINTPTR cur_addr = (UINTPTR) XPAR_CONV_0_S00_AXI_BASEADDR; pFilter[0][0] = 1;
pFilter[0][1] = 3;
pFilter[0][2] = 1;
pFilter[1][0] = 0;
pFilter[1][1] = 5;
pFilter[1][2] = 0;
pFilter[2][0] = 2;
pFilter[2][1] = 1;
pFilter[2][2] = 2; init_platform();
for (i = 0; i < 9; i++) {
Xil_Out32(cur_addr, 0);
cur_addr = cur_addr + 4;
}
load_kernel(pFilter);
printf("Kernel Loaded\n"); #ifdef test_single
test_set();
result = Xil_In32(XPAR_CONV_0_S00_AXI_BASEADDR + 72);
printf("Test Set Result %u", result);
show_reg();
#endif
#ifdef test_func
srand(10);
arrW = 20;
arrH = 20;
resH = filterH + arrH - 1;
resW = filterW + arrW - 1;
for (i = 0; i < arrH; i++) {
for (j = 0; j < arrW; j++) {
arr[i][j] = rand() % 20;
}
}
printf("*********************************************** \n");
printf("Filter: \n");
for (i = filterH - 1; i >= 0; i--) {
for (j = filterW - 1; j >= 0; j--) {
printf("%d ", pFilter[i][j]);
}
printf("\n");
}
printf("*********************************************** \n"); printf("Matrix: \n");
for (i = 0; i < arrH; i++) {
for (j = 0; j < arrW; j++) {
printf("%4d ", arr[i][j]);
}
printf("\n");
}
printf("*********************************************** \n");
printf("Software Start!\n");
Conv_SW(pFilter, arr, arrW, arrH);
printf("\nSoftware end!\n"); printf("*********************************************** \n");
printf("Result1: \n");
for (i = 0; i < resH; i++) {
for (j = 0; j < resW; j++) {
printf("%5d ", res[i][j]);
}
printf("\n");
} for (i = 0; i < resH; i++) {
for (j = 0; j < resW; j++) {
res[i][j] = 0;
}
}
printf("*********************************************** \n");
printf("HardWare Start!\n");
Conv_HW(pFilter, arr, arrW, arrH);
printf("\nHardWare end!");
printf("Result2: \n");
for (i = 0; i < resH; i++) {
for (j = 0; j < resW; j++) {
printf("%5d ", res[i][j]);
}
printf("\n");
}
printf("*********************************************** \n");
#endif
#ifdef test_speed
arrW = 500;
arrH = 500;
resH = filterH + arrH - 1;
resW = filterW + arrW - 1;
printf("Software Start!\n"); for(i = 0; i< 200;i++) {
Conv_SW(pFilter, arr, arrW, arrH);
}
printf("\nSoftware end!\n");
printf("HardWare Start!\n");
for(i = 0; i< 200;i++) {
Conv_HW(pFilter, arr, arrW, arrH);
}
printf("\nHardWare end!");
cleanup_platform();
#endif
return 0;
}

基于AXI4总线卷积FPGA加速IP核的尝试的更多相关文章

  1. 基于AMBA总线的SPI协议IP核的设计与验证

    https://wenku.baidu.com/view/9542213131126edb6f1a1048.html?mark_pay_doc=2&mark_rec_page=1&ma ...

  2. 自定义AXI总线形式SPI接口IP核,点亮OLED

    一.前言 最近花费很多精力在算法仿真和实现上,外设接口的调试略有生疏.本文以FPGA控制OLED中的SPI接口为例,重新夯实下基础.重点内容为SPI时序的RTL设计以及AXI-Lite总线分析.当然做 ...

  3. FPGA内部IP核DDS

    项目当中需要正弦信号与余弦信号,首先想到了DDS芯片,例如AD9833.AD9834.由于还需要用FPGA   做一些数据处理,后来干脆直接用FPGA 内部的DDSIP核,同时根据IP核内部的相位累加 ...

  4. 一步一步学ZedBoard & Zynq(四):基于AXI Lite 总线的从设备IP设计

    本帖最后由 xinxincaijq 于 2013-1-9 10:27 编辑 一步一步学ZedBoard & Zynq(四):基于AXI Lite 总线的从设备IP设计 转自博客:http:// ...

  5. AXI-Lite总线及其自定义IP核使用分析总结

    ZYNQ的优势在于通过高效的接口总线组成了ARM+FPGA的架构.我认为两者是互为底层的,当进行算法验证时,ARM端现有的硬件控制器和库函数可以很方便地连接外设,而不像FPGA设计那样完全写出接口时序 ...

  6. 基于MIG IP核的DDR3控制器(一)

    最近学习了DDR3控制器的使用,也用着DDR完成了一些简单工作,想着以后一段可能只用封装过后的IP核,可能会忘记DDR3控制器的一些内容,想着把这个DDR控制器的编写过程记录下来,便于我自己以后查看吧 ...

  7. 每天进步一点点------SOPC的Avalon-MM IP核(二) AVALON总线的IP核定制

    简介 NIOS II是一个建立在FPGA上的嵌入式软核处理器,除了可以根据需要任意添加已经提供的外设外,用户还可以通过定制用户逻辑外设和定制用户指令来实现各种应用要求.这节我们就来研究如何定制基于Av ...

  8. Vivado设计二:zynq的PS访问PL中的自带IP核(基于zybo)

    1.建立工程 首先和Vivado设计一中一样,先建立工程(这部分就忽略了) 2.create block design 同样,Add IP 同样,也添加配置文件,这些都和设计一是一样的,没什么区别. ...

  9. 【6集iCore3_ADP触摸屏驱动讲解视频】6-2 基于FSMC总线的ARM与FPGA通信

    视频简介: 该视频介绍基于FSMC总线的ARM与FPGA通信   源视频包下载地址: 链接:http://pan.baidu.com/s/1slJDoQD 密码:tmw7   银杏科技优酷视频发布区: ...

随机推荐

  1. [补档][COGS 2434]暗之链锁

    [COGS 2434]暗之链锁 题目 传说中的暗之连锁被人们称为Dark.<!--more-->Dark是人类内心的黑暗的产物,古今中外的勇者们都试图打倒它.经过研究,你发现Dark呈现无 ...

  2. hdu 6045 Is Derek lying?(思维推导)

    Problem Description Derek and Alfia are good friends.Derek is Chinese,and Alfia is Austrian.This sum ...

  3. maven系列小技巧之Top3 MVQ(most valuable question)

    首先声明,文章系个人原创 ,欢迎转载,转载请注明出处. 对于maven,有大神曾说过:如果你爱他,就请让他用Maven,因为那里是天堂,如果你恨他,就请让他用Maven,因为那里是地狱.尤其是mave ...

  4. zabbix 3.2.7 (源码包)安装部署

    Zabbix 3.2.7 + CentOS7 安装 环境准备: 操作系统 CentOS Linux release 7.2.1511 (Core) zabbix server 10.30.94.60 ...

  5. CentOS 7 服务器配置--安装nginx

    #安装pcre yum install -y pcre-devel #安装zlib-devel yum install -y zlib-devel #下载nginx wget -r -np -nd h ...

  6. oracle 查询优化改写

    -----------书籍: oracle 查询优化改写-----------第1个“C###oracle”为登录数据库的用户名,第2个“oracleChange”为登录数据库的密码“oracleCh ...

  7. vue.js实现瀑布流之vue-waterfall-easy

    想必大家应该很多都已经习惯了jquery的DOM操作,jquery的瀑布流实现起来也很容易. 但是,随着时代的发展,随着时代的进步..... 算了算了,扯远了,既然能找到这儿来,肯定是在vue.js上 ...

  8. 如何在.xml中配置Servlet信息

    在编写好servlet文件后需要在web.xml文件下配置servlet,才能使servlet在服务器上运行.基本配置方式如下所示 <context-param> <param-na ...

  9. 将本地sql文件导入到mysql中

    cmd命令操作:先创建一个同名数据库,然后通过source导入sql文件 1.启动mysql 2.mysql -uroot -p 输入密码运行mysql 3.创建一个同名数据库 create data ...

  10. easyUI datagrid 列宽自适应(简单 图解)(转)

    响应数据格式: easyUI在html代码中结构: 发现了什么没有,我们的表头其实是一个td在td中有一个属性field那么我们就可以获得了; 以下就是自适应代码: //添加事件 function c ...