将 FIR 滤波器写得更“上游” —— 面向 FPGA 的架构与高性能编码实践

文章来源:OpenFPGA

FPGA 非常适合实现像 FIR 滤波器这样的信号处理功能。器件内的 DSP 单元带有内建的乘加(MAC)能力,非常适合这类应用。但正如 FPGA 设计中的大多数问题一样,可达的性能在很大程度上取决于我们设计的构架。

在基本层面上,一个 FIR 滤波器由三个主要部分组成:

  • A delay line 延迟线

  • Multipliers to apply the coefficients 用于乘以系数的乘法器

  • An accumulator to sum the products 将乘积累加的累加器

这些元素的具体实现方式会对FIR滤波器的性能产生重大影响。

直接型(Direct Form)

在直接型实现中,使用移位寄存器延迟输入采样。在每个延迟级上,样本乘以常数系数,然后把所有级的输出相加。

转置型(Transposed Form)

在转置型实现中,所有乘法器看到的是相同的输入样本。累加器被实现为一串加法器,并在它们之间插入寄存器。这种结构非常契合现代 FPGA 的 DSP 片(DSP slices),尤其当我们利用它们的内部流水线寄存器时效果最佳。

示例设计

下面我们比较这两种架构在 FPGA 上实现时的性能差异。

本例目标器件为 Artix-7,滤波器输入采样率目标 200 MHz,规格如下:

  • 通带:低于 25 MHz

  • 阻带:高于 30 MHz

系数用 TFilter 在线(http://t-filter.engineerjs.com/)工具生成:

1.png

TFilter可交互地创建滤波器。

本次设计的滤波器如下:

2.png

滤波器设计完成后,转入 RTL 编写。虽然可以使用 Vivado 的 FIR Compiler IP,为了清楚地展示架构差异,在此我们手写 RTL。

直接型 RTL 与 Testbench

下面为直接型FIR滤波器的RTL表达式。为了测试该设计,使用了一个cocotb测试平台,该平台施加两个信号:一个位于通带,一个位于阻带。

library ieee;
use ieee.std_logic_1164.all;
use ieee.numeric_std.all;

entity fir_21tap is
  generic (
    DATA_WIDTH  : integer := 16;
    COEFF_WIDTH : integer := 16
  );
  port (
    clk      : in  std_logic;
    rst      : in  std_logic;
    data_in  : in  signed(DATA_WIDTH-1 downto 0);
    data_out : out signed(15 downto 0)   -- 16-bit output
  );
end entity fir_21tap;

architecture rtl of fir_21tap is

  constant NUM_TAPS   : integer := 21;
  constant PROD_WIDTH : integer := DATA_WIDTH + COEFF_WIDTH;-- 16+16 = 32
  constant ACC_WIDTH  : integer := PROD_WIDTH + 5;   -- headroom for sum

  subtype sample_t is signed(DATA_WIDTH-1 downto 0);
  subtype coeff_t  is signed(COEFF_WIDTH-1 downto 0);
  type sample_array_t is array (0 to NUM_TAPS-1) of sample_t;
  type coeff_array_t  is array (0 to NUM_TAPS-1) of coeff_t;

  -- 21 coefficients (h[0] is for most recent sample, h[20] for oldest)
  constant COEFFS : coeff_array_t := (
    to_signed(   937, COEFF_WIDTH),
    to_signed(  2402, COEFF_WIDTH),
    to_signed(  1479, COEFF_WIDTH),
    to_signed( -1122, COEFF_WIDTH),
    to_signed( -1138, COEFF_WIDTH),
    to_signed(  1751, COEFF_WIDTH),
    to_signed(  1079, COEFF_WIDTH),
    to_signed( -3238, COEFF_WIDTH),
    to_signed( -1119, COEFF_WIDTH),
    to_signed( 10356, COEFF_WIDTH),
    to_signed( 17504, COEFF_WIDTH),
    to_signed( 10356, COEFF_WIDTH),
    to_signed( -1119, COEFF_WIDTH),
    to_signed( -3238, COEFF_WIDTH),
    to_signed(  1079, COEFF_WIDTH),
    to_signed(  1751, COEFF_WIDTH),
    to_signed( -1138, COEFF_WIDTH),
    to_signed( -1122, COEFF_WIDTH),
    to_signed(  1479, COEFF_WIDTH),
    to_signed(  2402, COEFF_WIDTH),
    to_signed(   937, COEFF_WIDTH)
  );

  -- Shift register for samples
  signal x_reg : sample_array_t := (others => (others => '0'));

  -- Full-precision accumulator
  signal acc_reg : signed(ACC_WIDTH-1 downto 0) := (others => '0');

begin

  ------------------------------------------------------------------------
  -- 16-bit output: take the most significant 16 bits of the accumulator
  ------------------------------------------------------------------------
  data_out <= acc_reg(ACC_WIDTH-1 downto ACC_WIDTH-16);

  ------------------------------------------------------------------------
  -- Input sample shift register
  ------------------------------------------------------------------------
  shift_reg_proc : process (clk)
  begin
    if rising_edge(clk) then
      if rst = '1' then
        x_reg <= (others => (others => '0'));
      else
        x_reg(0) <= data_in;
        for i in 1 to NUM_TAPS-1 loop
          x_reg(i) <= x_reg(i-1);
        end loop;
      end if;
    end if;
  end process shift_reg_proc;

  ------------------------------------------------------------------------
  -- Multiply-accumulate
  ------------------------------------------------------------------------
  mac_proc : process (clk)
    variable sum  : signed(ACC_WIDTH-1 downto 0);
    variable prod : signed(PROD_WIDTH-1 downto 0);  -- 32 bits
  begin
    if rising_edge(clk) then
      if rst = '1' then
        acc_reg <= (others => '0');
      else
        sum := (others => '0');

        for i in 0 to NUM_TAPS-1 loop
          -- Multiply 16x16 -> 32 bits, then resize to accumulator width
          prod := x_reg(i) * COEFFS(i);               -- result is 32 bits
          sum  := sum + resize(prod, ACC_WIDTH);
        end loop;

        acc_reg <= sum;
      end if;
    end if;
  end process mac_proc;

end architecture rtl;

cocotb测试平台

import math
import numpy as np

import cocotb
from cocotb.clock import Clock
from cocotb.triggers import RisingEdge, Timer


CLK_FREQ_HZ = 100e6      # 100 MHz sample clock
CLK_PERIOD_NS = 1e9 / CLK_FREQ_HZ  # 10 ns
NUM_TAPS = 21
DATA_WIDTH = 16


def gen_tone(freq_hz, fs_hz, n_samples, amplitude=0.8):
    """
    Generate a sine wave tone as 16-bit signed integers (Q1.15 style).

    freq_hz  : tone frequency
    fs_hz    : sample rate
    n_samples: number of samples
    amplitude: 0.0 .. 1.0 (scaled to full-scale 16-bit)
    """
    t = np.arange(n_samples) / fs_hz
    # Full-scale amplitude for signed 16-bit is 32767
    scale = int((2**(DATA_WIDTH - 1) - 1) * amplitude)

    samples = scale * np.sin(2.0 * math.pi * freq_hz * t)
    return np.round(samples).astype(np.int16)


async def apply_tone(dut, freq_hz, n_samples, label):
    """
    Apply a sine tone to the filter and capture the output.

    Returns: (input_samples, output_samples) as numpy arrays of int16
    """
    fs = CLK_FREQ_HZ
    in_samples = gen_tone(freq_hz, fs, n_samples, amplitude=0.8)

    out_samples = []

    dut._log.info(f"--- Applying {label} tone: {freq_hz/1e6:.2f} MHz ---")

    for i, sample in enumerate(in_samples):
        dut.data_in.value = int(sample)  # cocotb handles signed int
        await RisingEdge(dut.clk)
        # Read signed value from DUT (data_out is signed(15 downto 0))
        out_val = dut.data_out.value.signed_integer
        out_samples.append(out_val)

    # Convert to numpy int16 for convenience
    in_arr = np.array(in_samples, dtype=np.int16)
    out_arr = np.array(out_samples, dtype=np.int32)  # keep wider here

    # Ignore initial transient due to filter latency (~NUM_TAPS)
    steady_start = NUM_TAPS
    in_steady = in_arr[steady_start:]
    out_steady = out_arr[steady_start:]

    # Compute simple RMS to compare levels
    in_rms = math.sqrt(np.mean(in_steady.astype(np.float64)**2))
    out_rms = math.sqrt(np.mean(out_steady.astype(np.float64)**2))

    dut._log.info(
        f"{label} tone {freq_hz/1e6:.2f} MHz: "
        f"input RMS = {in_rms:.2f}, output RMS = {out_rms:.2f}"
    )

    return in_arr, out_arr


@cocotb.test()
async def fir_21tap_pass_stop_tones(dut):
    """
    Test the FIR with:
      - One tone in the passband (< 25 MHz, e.g. 10 MHz)
      - One tone in the stopband (> 25 MHz, e.g. 30 MHz)

    Clock is 100 MHz (10 ns period).
    """

    # Start 100 MHz clock
    cocotb.start_soon(Clock(dut.clk, CLK_PERIOD_NS, units="ns").start())

    # Reset sequence
    dut.rst.value = 1
    dut.data_in.value = 0
    await Timer(5 * CLK_PERIOD_NS, units="ns")
    for _ in range(5):
        await RisingEdge(dut.clk)
    dut.rst.value = 0
    await RisingEdge(dut.clk)

    # Number of samples per tone (you can increase for better spectral resolution)
    N_SAMPLES = 1024

    # Choose two test frequencies relative to 100 MHz sample rate
    pass_freq_hz = 10e6   # 10 MHz, inside passband (< 25 MHz)
    stop_freq_hz = 30e6   # 30 MHz, in stopband (> 25 MHz)

    # Apply passband tone and capture output
    in_pass, out_pass = await apply_tone(
        dut, pass_freq_hz, N_SAMPLES, label="PASSBAND"
    )

    # Small gap between tones (optional)
    for _ in range(10):
        dut.data_in.value = 0
        await RisingEdge(dut.clk)

    # Apply stopband tone and capture output
    in_stop, out_stop = await apply_tone(
        dut, stop_freq_hz, N_SAMPLES, label="STOPBAND"
    )

    # Optional: compute RMS ratio between passband and stopband outputs
    # (ignoring transient at start)
    steady_start = NUM_TAPS
    out_pass_steady = out_pass[steady_start:]
    out_stop_steady = out_stop[steady_start:]

    pass_rms = math.sqrt(
        np.mean(out_pass_steady.astype(np.float64) ** 2)
    )
    stop_rms = math.sqrt(
        np.mean(out_stop_steady.astype(np.float64) ** 2)
    )

    dut._log.info(
        f"Final comparison: passband RMS = {pass_rms:.2f}, "
        f"stopband RMS = {stop_rms:.2f}, "
        f"ratio (stop/pass) = {stop_rms/pass_rms:.3f}"
    )

    # Example loose check: ensure stopband is attenuated by some factor.
    # You can tighten this once you know the expected filter response.
    assert stop_rms < pass_rms, (
        "Expected stopband tone to be attenuated relative to passband tone"
    )

仿真结果显示:在时域上可以清楚看到,通带信号在输出处被保留,而阻带信号被衰减。

3.png

但是,当我们把这个设计综合到 FPGA 时,很快在要求的时钟频率下遇到时序问题。

时序与 FMAX

可以用下式估算设计的潜在最大工作频率:

FMAX (MHz) = 1000 / (T − WNS)

其中:

  • T 是时钟周期(ns)

  • WNS 是 worst negative slack(ns)

在本设计中,长的组合累加路径把我们限制在约 31 MHz 左右。问题不是滤波器本身,而是 RTL 的架构:累加路径没有针对 FPGA 的 DSP 与布线资源优化。

4.png

为 FPGA 重新架构

要为 FPGA 优化滤波器,我们需要:

  • 使用转置形式的 FIR

  • 正确对 DSP 元件做流水线

这意味着要启用 DSP48 的内部流水线寄存器,使设计映射到:

  • DSP48 的 A 和 B 输入寄存器

  • P 输出寄存器

下文给出了转置形式的修订 RTL,并可以复用同一套 cocotb 测试平台验证功能正确性。

library ieee;
use ieee.std_logic_1164.all;
use ieee.numeric_std.all;

entity fir_21tap_2reg is
  generic (
    DATA_WIDTH  : integer := 16;
    COEFF_WIDTH : integer := 16
  );
  port (
    clk      : in  std_logic;
    rst      : in  std_logic;
    data_in  : in  signed(DATA_WIDTH-1 downto 0);
    data_out : out signed(15 downto 0)   -- 16-bit registered output
  );
end entity fir_21tap_2reg;

architecture rtl of fir_21tap_2reg is

  constant NUM_TAPS   : integer := 21;
  constant PROD_WIDTH : integer := DATA_WIDTH + COEFF_WIDTH; -- 16+16 = 32
  constant ACC_WIDTH  : integer := PROD_WIDTH + 5;           -- headroom
  subtype coeff_t is signed(COEFF_WIDTH-1 downto 0);
  subtype acc_t   is signed(ACC_WIDTH-1 downto 0);
  type coeff_array_t is array (0 to NUM_TAPS-1) of coeff_t;
  type state_array_t is array (0 to NUM_TAPS-2) of acc_t;   -- N-1 states

  ------------------------------------------------------------------------
  -- 21 FIR coefficients h[0]..h[20]
  ------------------------------------------------------------------------
  constant COEFFS : coeff_array_t := (
    to_signed(   937, COEFF_WIDTH),  -- h0
    to_signed(  2402, COEFF_WIDTH),
    to_signed(  1479, COEFF_WIDTH),
    to_signed( -1122, COEFF_WIDTH),
    to_signed( -1138, COEFF_WIDTH),
    to_signed(  1751, COEFF_WIDTH),
    to_signed(  1079, COEFF_WIDTH),
    to_signed( -3238, COEFF_WIDTH),
    to_signed( -1119, COEFF_WIDTH),
    to_signed( 10356, COEFF_WIDTH),
    to_signed( 17504, COEFF_WIDTH),
    to_signed( 10356, COEFF_WIDTH),
    to_signed( -1119, COEFF_WIDTH),
    to_signed( -3238, COEFF_WIDTH),
    to_signed(  1079, COEFF_WIDTH),
    to_signed(  1751, COEFF_WIDTH),
    to_signed( -1138, COEFF_WIDTH),
    to_signed( -1122, COEFF_WIDTH),
    to_signed(  1479, COEFF_WIDTH),
    to_signed(  2402, COEFF_WIDTH),
    to_signed(   937, COEFF_WIDTH)   -- h20
  );

  ------------------------------------------------------------------------
  -- State registers for transposed structure: s(0)..s(19)
  ------------------------------------------------------------------------
  signal state       : state_array_t := (others => (others => '0'));
  signal data_out_reg: signed(15 downto 0) := (others => '0');

begin

  data_out <= data_out_reg;

  ------------------------------------------------------------------------
  -- Transposed-form FIR
  --
  -- Equations (for N=21, indices 0..20):
  --   z_20[n]          = h20 * x[n]
  --   z_k[n]           = h(k+1) * x[n] + z_{k+1}[n-1],  k = 0..19
  --   y[n]             = h0 * x[n] + z_0[n-1]
  --
  -- Here:
  --   state(k) holds z_k[n]      (k = 0..19)
  --   data_out_reg holds y[n]
  ------------------------------------------------------------------------
  fir_proc : process (clk)
    variable x_ext  : signed(DATA_WIDTH-1 downto 0);
    variable prod   : signed(PROD_WIDTH-1 downto 0);
    variable acc    : acc_t;
  begin
    if rising_edge(clk) then
      if rst = '1' then
        state        <= (others => (others => '0'));
        data_out_reg <= (others => '0');

      else
        -- Extend input to full precision once
        x_ext := data_in;

        ------------------------------------------------------------------
        -- Update last state: z_20[n] = h20 * x[n]
        ------------------------------------------------------------------
        prod := x_ext * COEFFS(NUM_TAPS-1);  -- h20 * x[n]
        acc  := resize(prod, ACC_WIDTH);
        state(NUM_TAPS-2) <= acc;           -- state(19)

        ------------------------------------------------------------------
        -- Update remaining states backward:
        --   state(k) <= h(k+1)*x[n] + state(k+1)(old)
        -- Note: state(k+1) on RHS is value from previous cycle (n-1),
        --       because signal reads see "old" value in this clock.
        ------------------------------------------------------------------
        for k in NUM_TAPS-3 downto 0 loop   -- k = 18 .. 0
          prod := x_ext * COEFFS(k+1);      -- h(k+1) * x[n]
          acc  := resize(prod, ACC_WIDTH) + state(k+1);
          state(k) <= acc;
        end loop;

        ------------------------------------------------------------------
        -- Output:
        --   y[n] = h0 * x[n] + z_0[n-1] = h0 * x[n] + state(0)(old)
        ------------------------------------------------------------------
        prod := x_ext * COEFFS(0);   -- h0 * x[n]
        acc  := resize(prod, ACC_WIDTH) + state(0);
        data_out_reg <= acc(ACC_WIDTH-1 downto ACC_WIDTH-16);  -- 16-bit

      end if;
    end if;
  end process;

end architecture rtl;

更新后的设计完成后,即可在目标设备上以所需的时钟频率满足时序要求。

5.png

结论

这个示例表明,为 FPGA 织构(fabric)而写的 RTL 在实现高性能上至关重要。两个功能上等价的 FIR 滤波器在实现后的表现可能大相径庭,关键在于它们映射到器件 DSP 与布线资源的契合程度。