将 FIR 滤波器写得更“上游” —— 面向 FPGA 的架构与高性能编码实践

judy 在周三, 12/10/2025 - 11:17 提交

文章来源：OpenFPGA

FPGA 非常适合实现像 FIR 滤波器这样的信号处理功能。器件内的 DSP 单元带有内建的乘加（MAC）能力，非常适合这类应用。但正如 FPGA 设计中的大多数问题一样，可达的性能在很大程度上取决于我们设计的构架。

在基本层面上，一个 FIR 滤波器由三个主要部分组成：

A delay line 延迟线
Multipliers to apply the coefficients 用于乘以系数的乘法器
An accumulator to sum the products 将乘积累加的累加器

这些元素的具体实现方式会对FIR滤波器的性能产生重大影响。

直接型（Direct Form）

在直接型实现中，使用移位寄存器延迟输入采样。在每个延迟级上，样本乘以常数系数，然后把所有级的输出相加。

转置型（Transposed Form）

在转置型实现中，所有乘法器看到的是相同的输入样本。累加器被实现为一串加法器，并在它们之间插入寄存器。这种结构非常契合现代 FPGA 的 DSP 片（DSP slices），尤其当我们利用它们的内部流水线寄存器时效果最佳。

示例设计

下面我们比较这两种架构在 FPGA 上实现时的性能差异。

本例目标器件为 Artix-7，滤波器输入采样率目标 200 MHz，规格如下：

通带：低于 25 MHz
阻带：高于 30 MHz

系数用 TFilter 在线（http://t-filter.engineerjs.com/）工具生成：

TFilter可交互地创建滤波器。

本次设计的滤波器如下：

滤波器设计完成后，转入 RTL 编写。虽然可以使用 Vivado 的 FIR Compiler IP，为了清楚地展示架构差异，在此我们手写 RTL。

直接型 RTL 与 Testbench

下面为直接型FIR滤波器的RTL表达式。为了测试该设计，使用了一个cocotb测试平台，该平台施加两个信号：一个位于通带，一个位于阻带。

library ieee;
use ieee.std_logic_1164.all;
use ieee.numeric_std.all;

entity fir_21tap is
generic (
DATA_WIDTH : integer := 16;
COEFF_WIDTH : integer := 16
);
port (
clk : in std_logic;
rst : in std_logic;
data_in : in signed(DATA_WIDTH-1 downto 0);
data_out : out signed(15 downto 0) -- 16-bit output
);
end entity fir_21tap;

architecture rtl of fir_21tap is

constant NUM_TAPS : integer := 21;
constant PROD_WIDTH : integer := DATA_WIDTH + COEFF_WIDTH;-- 16+16 = 32
constant ACC_WIDTH : integer := PROD_WIDTH + 5; -- headroom for sum

subtype sample_t is signed(DATA_WIDTH-1 downto 0);
subtype coeff_t is signed(COEFF_WIDTH-1 downto 0);
  type sample_array_t is array (0 to NUM_TAPS-1) of sample_t;
  type coeff_array_t is array (0 to NUM_TAPS-1) of coeff_t;

-- 21 coefficients (h[0] is for most recent sample, h[20] for oldest)
constant COEFFS : coeff_array_t := (
to_signed( 937, COEFF_WIDTH),
to_signed( 2402, COEFF_WIDTH),
to_signed( 1479, COEFF_WIDTH),
to_signed( -1122, COEFF_WIDTH),
to_signed( -1138, COEFF_WIDTH),
to_signed( 1751, COEFF_WIDTH),
to_signed( 1079, COEFF_WIDTH),
to_signed( -3238, COEFF_WIDTH),
to_signed( -1119, COEFF_WIDTH),
to_signed( 10356, COEFF_WIDTH),
to_signed( 17504, COEFF_WIDTH),
to_signed( 10356, COEFF_WIDTH),
to_signed( -1119, COEFF_WIDTH),
to_signed( -3238, COEFF_WIDTH),
to_signed( 1079, COEFF_WIDTH),
to_signed( 1751, COEFF_WIDTH),
to_signed( -1138, COEFF_WIDTH),
to_signed( -1122, COEFF_WIDTH),
to_signed( 1479, COEFF_WIDTH),
to_signed( 2402, COEFF_WIDTH),
to_signed( 937, COEFF_WIDTH)
);

-- Shift register for samples
signal x_reg : sample_array_t := (others => (others => '0'));

-- Full-precision accumulator
signal acc_reg : signed(ACC_WIDTH-1 downto 0) := (others => '0');

begin

------------------------------------------------------------------------
-- 16-bit output: take the most significant 16 bits of the accumulator
------------------------------------------------------------------------
data_out <= acc_reg(ACC_WIDTH-1 downto ACC_WIDTH-16);

------------------------------------------------------------------------
-- Input sample shift register
------------------------------------------------------------------------
shift_reg_proc : process (clk)
begin
  if rising_edge(clk) then
  if rst = '1' then
x_reg <= (others => (others => '0'));
  else
x_reg(0) <= data_in;
  for i in 1 to NUM_TAPS-1 loop
x_reg(i) <= x_reg(i-1);
end loop;
end if;
end if;
end process shift_reg_proc;

------------------------------------------------------------------------
-- Multiply-accumulate
------------------------------------------------------------------------
mac_proc : process (clk)
variable sum : signed(ACC_WIDTH-1 downto 0);
variable prod : signed(PROD_WIDTH-1 downto 0); -- 32 bits
begin
  if rising_edge(clk) then
  if rst = '1' then
acc_reg <= (others => '0');
  else
sum := (others => '0');

  for i in 0 to NUM_TAPS-1 loop
-- Multiply 16x16 -> 32 bits, then resize to accumulator width
prod := x_reg(i) * COEFFS(i); -- result is 32 bits
sum := sum + resize(prod, ACC_WIDTH);
end loop;

acc_reg <= sum;
end if;
end if;
end process mac_proc;

end architecture rtl;

cocotb测试平台

import math
import numpy as np

import cocotb
from cocotb.clock import Clock
from cocotb.triggers import RisingEdge, Timer

CLK_FREQ_HZ = 100e6 # 100 MHz sample clock
CLK_PERIOD_NS = 1e9 / CLK_FREQ_HZ # 10 ns
NUM_TAPS = 21
DATA_WIDTH = 16

def gen_tone(freq_hz, fs_hz, n_samples, amplitude=0.8):
  """
Generate a sine wave tone as 16-bit signed integers (Q1.15 style).

freq_hz : tone frequency
fs_hz : sample rate
n_samples: number of samples
amplitude: 0.0 .. 1.0 (scaled to full-scale 16-bit)
"""
t = np.arange(n_samples) / fs_hz
  # Full-scale amplitude for signed 16-bit is 32767
scale = int((2**(DATA_WIDTH - 1) - 1) * amplitude)

samples = scale * np.sin(2.0 * math.pi * freq_hz * t)
  return np.round(samples).astype(np.int16)

async def apply_tone(dut, freq_hz, n_samples, label):
  """
Apply a sine tone to the filter and capture the output.

Returns: (input_samples, output_samples) as numpy arrays of int16
"""
fs = CLK_FREQ_HZ
in_samples = gen_tone(freq_hz, fs, n_samples, amplitude=0.8)

out_samples = []

dut._log.info(f"--- Applying {label} tone: {freq_hz/1e6:.2f} MHz ---")

  for i, sample in enumerate(in_samples):
dut.data_in.value = int(sample) # cocotb handles signed int
await RisingEdge(dut.clk)
  # Read signed value from DUT (data_out is signed(15 downto 0))
out_val = dut.data_out.value.signed_integer
out_samples.append(out_val)

  # Convert to numpy int16 for convenience
in_arr = np.array(in_samples, dtype=np.int16)
out_arr = np.array(out_samples, dtype=np.int32) # keep wider here

  # Ignore initial transient due to filter latency (~NUM_TAPS)
steady_start = NUM_TAPS
in_steady = in_arr[steady_start:]
out_steady = out_arr[steady_start:]

  # Compute simple RMS to compare levels
in_rms = math.sqrt(np.mean(in_steady.astype(np.float64)**2))
out_rms = math.sqrt(np.mean(out_steady.astype(np.float64)**2))

dut._log.info(
f"{label} tone {freq_hz/1e6:.2f} MHz: "
f"input RMS = {in_rms:.2f}, output RMS = {out_rms:.2f}"
)

  return in_arr, out_arr

@cocotb.test()
async def fir_21tap_pass_stop_tones(dut):
  """
Test the FIR with:
- One tone in the passband (< 25 MHz, e.g. 10 MHz)
- One tone in the stopband (> 25 MHz, e.g. 30 MHz)

Clock is 100 MHz (10 ns period).
"""

  # Start 100 MHz clock
cocotb.start_soon(Clock(dut.clk, CLK_PERIOD_NS, units="ns").start())

  # Reset sequence
dut.rst.value = 1
dut.data_in.value = 0
await Timer(5 * CLK_PERIOD_NS, units="ns")
  for _ in range(5):
await RisingEdge(dut.clk)
dut.rst.value = 0
await RisingEdge(dut.clk)

  # Number of samples per tone (you can increase for better spectral resolution)
N_SAMPLES = 1024

  # Choose two test frequencies relative to 100 MHz sample rate
pass_freq_hz = 10e6   # 10 MHz, inside passband (< 25 MHz)
stop_freq_hz = 30e6   # 30 MHz, in stopband (> 25 MHz)

  # Apply passband tone and capture output
in_pass, out_pass = await apply_tone(
dut, pass_freq_hz, N_SAMPLES, label="PASSBAND"
)

  # Small gap between tones (optional)
  for _ in range(10):
dut.data_in.value = 0
await RisingEdge(dut.clk)

  # Apply stopband tone and capture output
in_stop, out_stop = await apply_tone(
dut, stop_freq_hz, N_SAMPLES, label="STOPBAND"
)

  # Optional: compute RMS ratio between passband and stopband outputs
  # (ignoring transient at start)
steady_start = NUM_TAPS
out_pass_steady = out_pass[steady_start:]
out_stop_steady = out_stop[steady_start:]

pass_rms = math.sqrt(
np.mean(out_pass_steady.astype(np.float64) ** 2)
)
stop_rms = math.sqrt(
np.mean(out_stop_steady.astype(np.float64) ** 2)
)

dut._log.info(
f"Final comparison: passband RMS = {pass_rms:.2f}, "
f"stopband RMS = {stop_rms:.2f}, "
f"ratio (stop/pass) = {stop_rms/pass_rms:.3f}"
)

  # Example loose check: ensure stopband is attenuated by some factor.
  # You can tighten this once you know the expected filter response.
assert stop_rms < pass_rms, (
  "Expected stopband tone to be attenuated relative to passband tone"
)

仿真结果显示：在时域上可以清楚看到，通带信号在输出处被保留，而阻带信号被衰减。

但是，当我们把这个设计综合到 FPGA 时，很快在要求的时钟频率下遇到时序问题。

时序与 FMAX

可以用下式估算设计的潜在最大工作频率：

FMAX (MHz) = 1000 / (T − WNS)

其中：

T 是时钟周期（ns）
WNS 是 worst negative slack（ns）

在本设计中，长的组合累加路径把我们限制在约 31 MHz 左右。问题不是滤波器本身，而是 RTL 的架构：累加路径没有针对 FPGA 的 DSP 与布线资源优化。

为 FPGA 重新架构

要为 FPGA 优化滤波器，我们需要：

使用转置形式的 FIR
正确对 DSP 元件做流水线

这意味着要启用 DSP48 的内部流水线寄存器，使设计映射到：

DSP48 的 A 和 B 输入寄存器
P 输出寄存器

下文给出了转置形式的修订 RTL，并可以复用同一套 cocotb 测试平台验证功能正确性。

library ieee;
use ieee.std_logic_1164.all;
use ieee.numeric_std.all;

entity fir_21tap_2reg is
generic (
DATA_WIDTH : integer := 16;
COEFF_WIDTH : integer := 16
);
port (
clk : in std_logic;
rst : in std_logic;
data_in : in signed(DATA_WIDTH-1 downto 0);
data_out : out signed(15 downto 0) -- 16-bit registered output
);
end entity fir_21tap_2reg;

architecture rtl of fir_21tap_2reg is

constant NUM_TAPS : integer := 21;
constant PROD_WIDTH : integer := DATA_WIDTH + COEFF_WIDTH; -- 16+16 = 32
constant ACC_WIDTH : integer := PROD_WIDTH + 5; -- headroom
subtype coeff_t is signed(COEFF_WIDTH-1 downto 0);
subtype acc_t is signed(ACC_WIDTH-1 downto 0);
  type coeff_array_t is array (0 to NUM_TAPS-1) of coeff_t;
  type state_array_t is array (0 to NUM_TAPS-2) of acc_t; -- N-1 states

------------------------------------------------------------------------
-- 21 FIR coefficients h[0]..h[20]
------------------------------------------------------------------------
constant COEFFS : coeff_array_t := (
to_signed( 937, COEFF_WIDTH), -- h0
to_signed( 2402, COEFF_WIDTH),
to_signed( 1479, COEFF_WIDTH),
to_signed( -1122, COEFF_WIDTH),
to_signed( -1138, COEFF_WIDTH),
to_signed( 1751, COEFF_WIDTH),
to_signed( 1079, COEFF_WIDTH),
to_signed( -3238, COEFF_WIDTH),
to_signed( -1119, COEFF_WIDTH),
to_signed( 10356, COEFF_WIDTH),
to_signed( 17504, COEFF_WIDTH),
to_signed( 10356, COEFF_WIDTH),
to_signed( -1119, COEFF_WIDTH),
to_signed( -3238, COEFF_WIDTH),
to_signed( 1079, COEFF_WIDTH),
to_signed( 1751, COEFF_WIDTH),
to_signed( -1138, COEFF_WIDTH),
to_signed( -1122, COEFF_WIDTH),
to_signed( 1479, COEFF_WIDTH),
to_signed( 2402, COEFF_WIDTH),
to_signed( 937, COEFF_WIDTH) -- h20
);

------------------------------------------------------------------------
-- State registers for transposed structure: s(0)..s(19)
------------------------------------------------------------------------
signal state : state_array_t := (others => (others => '0'));
signal data_out_reg: signed(15 downto 0) := (others => '0');

begin

data_out <= data_out_reg;

------------------------------------------------------------------------
-- Transposed-form FIR
--
-- Equations (for N=21, indices 0..20):
-- z_20[n] = h20 * x[n]
-- z_k[n] = h(k+1) * x[n] + z_{k+1}[n-1], k = 0..19
-- y[n] = h0 * x[n] + z_0[n-1]
--
-- Here:
-- state(k) holds z_k[n] (k = 0..19)
-- data_out_reg holds y[n]
------------------------------------------------------------------------
fir_proc : process (clk)
variable x_ext : signed(DATA_WIDTH-1 downto 0);
variable prod : signed(PROD_WIDTH-1 downto 0);
variable acc : acc_t;
begin
  if rising_edge(clk) then
  if rst = '1' then
state <= (others => (others => '0'));
data_out_reg <= (others => '0');

  else
-- Extend input to full precision once
x_ext := data_in;

------------------------------------------------------------------
-- Update last state: z_20[n] = h20 * x[n]
------------------------------------------------------------------
prod := x_ext * COEFFS(NUM_TAPS-1); -- h20 * x[n]
acc := resize(prod, ACC_WIDTH);
state(NUM_TAPS-2) <= acc; -- state(19)

------------------------------------------------------------------
-- Update remaining states backward:
-- state(k) <= h(k+1)*x[n] + state(k+1)(old)
-- Note: state(k+1) on RHS is value from previous cycle (n-1),
-- because signal reads see "old" value in this clock.
------------------------------------------------------------------
  for k in NUM_TAPS-3 downto 0 loop -- k = 18 .. 0
prod := x_ext * COEFFS(k+1); -- h(k+1) * x[n]
acc := resize(prod, ACC_WIDTH) + state(k+1);
state(k) <= acc;
end loop;

------------------------------------------------------------------
-- Output:
-- y[n] = h0 * x[n] + z_0[n-1] = h0 * x[n] + state(0)(old)
------------------------------------------------------------------
prod := x_ext * COEFFS(0); -- h0 * x[n]
acc := resize(prod, ACC_WIDTH) + state(0);
data_out_reg <= acc(ACC_WIDTH-1 downto ACC_WIDTH-16); -- 16-bit

end if;
end if;
end process;

end architecture rtl;

更新后的设计完成后，即可在目标设备上以所需的时钟频率满足时序要求。

结论

这个示例表明，为 FPGA 织构（fabric）而写的 RTL 在实现高性能上至关重要。两个功能上等价的 FIR 滤波器在实现后的表现可能大相径庭，关键在于它们映射到器件 DSP 与布线资源的契合程度。