文章来源:OpenFPGA
FPGA 非常适合实现像 FIR 滤波器这样的信号处理功能。器件内的 DSP 单元带有内建的乘加(MAC)能力,非常适合这类应用。但正如 FPGA 设计中的大多数问题一样,可达的性能在很大程度上取决于我们设计的构架。
在基本层面上,一个 FIR 滤波器由三个主要部分组成:
A delay line 延迟线
Multipliers to apply the coefficients 用于乘以系数的乘法器
An accumulator to sum the products 将乘积累加的累加器
这些元素的具体实现方式会对FIR滤波器的性能产生重大影响。
直接型(Direct Form)
在直接型实现中,使用移位寄存器延迟输入采样。在每个延迟级上,样本乘以常数系数,然后把所有级的输出相加。
转置型(Transposed Form)
在转置型实现中,所有乘法器看到的是相同的输入样本。累加器被实现为一串加法器,并在它们之间插入寄存器。这种结构非常契合现代 FPGA 的 DSP 片(DSP slices),尤其当我们利用它们的内部流水线寄存器时效果最佳。
示例设计
下面我们比较这两种架构在 FPGA 上实现时的性能差异。
本例目标器件为 Artix-7,滤波器输入采样率目标 200 MHz,规格如下:
通带:低于 25 MHz
阻带:高于 30 MHz
系数用 TFilter 在线(http://t-filter.engineerjs.com/)工具生成:

TFilter可交互地创建滤波器。
本次设计的滤波器如下:

滤波器设计完成后,转入 RTL 编写。虽然可以使用 Vivado 的 FIR Compiler IP,为了清楚地展示架构差异,在此我们手写 RTL。
直接型 RTL 与 Testbench
下面为直接型FIR滤波器的RTL表达式。为了测试该设计,使用了一个cocotb测试平台,该平台施加两个信号:一个位于通带,一个位于阻带。
library ieee;
use ieee.std_logic_1164.all;
use ieee.numeric_std.all;
entity fir_21tap is
generic (
DATA_WIDTH : integer := 16;
COEFF_WIDTH : integer := 16
);
port (
clk : in std_logic;
rst : in std_logic;
data_in : in signed(DATA_WIDTH-1 downto 0);
data_out : out signed(15 downto 0) -- 16-bit output
);
end entity fir_21tap;
architecture rtl of fir_21tap is
constant NUM_TAPS : integer := 21;
constant PROD_WIDTH : integer := DATA_WIDTH + COEFF_WIDTH;-- 16+16 = 32
constant ACC_WIDTH : integer := PROD_WIDTH + 5; -- headroom for sum
subtype sample_t is signed(DATA_WIDTH-1 downto 0);
subtype coeff_t is signed(COEFF_WIDTH-1 downto 0);
type sample_array_t is array (0 to NUM_TAPS-1) of sample_t;
type coeff_array_t is array (0 to NUM_TAPS-1) of coeff_t;
-- 21 coefficients (h[0] is for most recent sample, h[20] for oldest)
constant COEFFS : coeff_array_t := (
to_signed( 937, COEFF_WIDTH),
to_signed( 2402, COEFF_WIDTH),
to_signed( 1479, COEFF_WIDTH),
to_signed( -1122, COEFF_WIDTH),
to_signed( -1138, COEFF_WIDTH),
to_signed( 1751, COEFF_WIDTH),
to_signed( 1079, COEFF_WIDTH),
to_signed( -3238, COEFF_WIDTH),
to_signed( -1119, COEFF_WIDTH),
to_signed( 10356, COEFF_WIDTH),
to_signed( 17504, COEFF_WIDTH),
to_signed( 10356, COEFF_WIDTH),
to_signed( -1119, COEFF_WIDTH),
to_signed( -3238, COEFF_WIDTH),
to_signed( 1079, COEFF_WIDTH),
to_signed( 1751, COEFF_WIDTH),
to_signed( -1138, COEFF_WIDTH),
to_signed( -1122, COEFF_WIDTH),
to_signed( 1479, COEFF_WIDTH),
to_signed( 2402, COEFF_WIDTH),
to_signed( 937, COEFF_WIDTH)
);
-- Shift register for samples
signal x_reg : sample_array_t := (others => (others => '0'));
-- Full-precision accumulator
signal acc_reg : signed(ACC_WIDTH-1 downto 0) := (others => '0');
begin
------------------------------------------------------------------------
-- 16-bit output: take the most significant 16 bits of the accumulator
------------------------------------------------------------------------
data_out <= acc_reg(ACC_WIDTH-1 downto ACC_WIDTH-16);
------------------------------------------------------------------------
-- Input sample shift register
------------------------------------------------------------------------
shift_reg_proc : process (clk)
begin
if rising_edge(clk) then
if rst = '1' then
x_reg <= (others => (others => '0'));
else
x_reg(0) <= data_in;
for i in 1 to NUM_TAPS-1 loop
x_reg(i) <= x_reg(i-1);
end loop;
end if;
end if;
end process shift_reg_proc;
------------------------------------------------------------------------
-- Multiply-accumulate
------------------------------------------------------------------------
mac_proc : process (clk)
variable sum : signed(ACC_WIDTH-1 downto 0);
variable prod : signed(PROD_WIDTH-1 downto 0); -- 32 bits
begin
if rising_edge(clk) then
if rst = '1' then
acc_reg <= (others => '0');
else
sum := (others => '0');
for i in 0 to NUM_TAPS-1 loop
-- Multiply 16x16 -> 32 bits, then resize to accumulator width
prod := x_reg(i) * COEFFS(i); -- result is 32 bits
sum := sum + resize(prod, ACC_WIDTH);
end loop;
acc_reg <= sum;
end if;
end if;
end process mac_proc;
end architecture rtl;
cocotb测试平台
import math
import numpy as np
import cocotb
from cocotb.clock import Clock
from cocotb.triggers import RisingEdge, Timer
CLK_FREQ_HZ = 100e6 # 100 MHz sample clock
CLK_PERIOD_NS = 1e9 / CLK_FREQ_HZ # 10 ns
NUM_TAPS = 21
DATA_WIDTH = 16
def gen_tone(freq_hz, fs_hz, n_samples, amplitude=0.8):
"""
Generate a sine wave tone as 16-bit signed integers (Q1.15 style).
freq_hz : tone frequency
fs_hz : sample rate
n_samples: number of samples
amplitude: 0.0 .. 1.0 (scaled to full-scale 16-bit)
"""
t = np.arange(n_samples) / fs_hz
# Full-scale amplitude for signed 16-bit is 32767
scale = int((2**(DATA_WIDTH - 1) - 1) * amplitude)
samples = scale * np.sin(2.0 * math.pi * freq_hz * t)
return np.round(samples).astype(np.int16)
async def apply_tone(dut, freq_hz, n_samples, label):
"""
Apply a sine tone to the filter and capture the output.
Returns: (input_samples, output_samples) as numpy arrays of int16
"""
fs = CLK_FREQ_HZ
in_samples = gen_tone(freq_hz, fs, n_samples, amplitude=0.8)
out_samples = []
dut._log.info(f"--- Applying {label} tone: {freq_hz/1e6:.2f} MHz ---")
for i, sample in enumerate(in_samples):
dut.data_in.value = int(sample) # cocotb handles signed int
await RisingEdge(dut.clk)
# Read signed value from DUT (data_out is signed(15 downto 0))
out_val = dut.data_out.value.signed_integer
out_samples.append(out_val)
# Convert to numpy int16 for convenience
in_arr = np.array(in_samples, dtype=np.int16)
out_arr = np.array(out_samples, dtype=np.int32) # keep wider here
# Ignore initial transient due to filter latency (~NUM_TAPS)
steady_start = NUM_TAPS
in_steady = in_arr[steady_start:]
out_steady = out_arr[steady_start:]
# Compute simple RMS to compare levels
in_rms = math.sqrt(np.mean(in_steady.astype(np.float64)**2))
out_rms = math.sqrt(np.mean(out_steady.astype(np.float64)**2))
dut._log.info(
f"{label} tone {freq_hz/1e6:.2f} MHz: "
f"input RMS = {in_rms:.2f}, output RMS = {out_rms:.2f}"
)
return in_arr, out_arr
@cocotb.test()
async def fir_21tap_pass_stop_tones(dut):
"""
Test the FIR with:
- One tone in the passband (< 25 MHz, e.g. 10 MHz)
- One tone in the stopband (> 25 MHz, e.g. 30 MHz)
Clock is 100 MHz (10 ns period).
"""
# Start 100 MHz clock
cocotb.start_soon(Clock(dut.clk, CLK_PERIOD_NS, units="ns").start())
# Reset sequence
dut.rst.value = 1
dut.data_in.value = 0
await Timer(5 * CLK_PERIOD_NS, units="ns")
for _ in range(5):
await RisingEdge(dut.clk)
dut.rst.value = 0
await RisingEdge(dut.clk)
# Number of samples per tone (you can increase for better spectral resolution)
N_SAMPLES = 1024
# Choose two test frequencies relative to 100 MHz sample rate
pass_freq_hz = 10e6 # 10 MHz, inside passband (< 25 MHz)
stop_freq_hz = 30e6 # 30 MHz, in stopband (> 25 MHz)
# Apply passband tone and capture output
in_pass, out_pass = await apply_tone(
dut, pass_freq_hz, N_SAMPLES, label="PASSBAND"
)
# Small gap between tones (optional)
for _ in range(10):
dut.data_in.value = 0
await RisingEdge(dut.clk)
# Apply stopband tone and capture output
in_stop, out_stop = await apply_tone(
dut, stop_freq_hz, N_SAMPLES, label="STOPBAND"
)
# Optional: compute RMS ratio between passband and stopband outputs
# (ignoring transient at start)
steady_start = NUM_TAPS
out_pass_steady = out_pass[steady_start:]
out_stop_steady = out_stop[steady_start:]
pass_rms = math.sqrt(
np.mean(out_pass_steady.astype(np.float64) ** 2)
)
stop_rms = math.sqrt(
np.mean(out_stop_steady.astype(np.float64) ** 2)
)
dut._log.info(
f"Final comparison: passband RMS = {pass_rms:.2f}, "
f"stopband RMS = {stop_rms:.2f}, "
f"ratio (stop/pass) = {stop_rms/pass_rms:.3f}"
)
# Example loose check: ensure stopband is attenuated by some factor.
# You can tighten this once you know the expected filter response.
assert stop_rms < pass_rms, (
"Expected stopband tone to be attenuated relative to passband tone"
)
仿真结果显示:在时域上可以清楚看到,通带信号在输出处被保留,而阻带信号被衰减。

但是,当我们把这个设计综合到 FPGA 时,很快在要求的时钟频率下遇到时序问题。
时序与 FMAX
可以用下式估算设计的潜在最大工作频率:
FMAX (MHz) = 1000 / (T − WNS)
其中:
T 是时钟周期(ns)
WNS 是 worst negative slack(ns)
在本设计中,长的组合累加路径把我们限制在约 31 MHz 左右。问题不是滤波器本身,而是 RTL 的架构:累加路径没有针对 FPGA 的 DSP 与布线资源优化。

为 FPGA 重新架构
要为 FPGA 优化滤波器,我们需要:
使用转置形式的 FIR
正确对 DSP 元件做流水线
这意味着要启用 DSP48 的内部流水线寄存器,使设计映射到:
DSP48 的 A 和 B 输入寄存器
P 输出寄存器
下文给出了转置形式的修订 RTL,并可以复用同一套 cocotb 测试平台验证功能正确性。
library ieee;
use ieee.std_logic_1164.all;
use ieee.numeric_std.all;
entity fir_21tap_2reg is
generic (
DATA_WIDTH : integer := 16;
COEFF_WIDTH : integer := 16
);
port (
clk : in std_logic;
rst : in std_logic;
data_in : in signed(DATA_WIDTH-1 downto 0);
data_out : out signed(15 downto 0) -- 16-bit registered output
);
end entity fir_21tap_2reg;
architecture rtl of fir_21tap_2reg is
constant NUM_TAPS : integer := 21;
constant PROD_WIDTH : integer := DATA_WIDTH + COEFF_WIDTH; -- 16+16 = 32
constant ACC_WIDTH : integer := PROD_WIDTH + 5; -- headroom
subtype coeff_t is signed(COEFF_WIDTH-1 downto 0);
subtype acc_t is signed(ACC_WIDTH-1 downto 0);
type coeff_array_t is array (0 to NUM_TAPS-1) of coeff_t;
type state_array_t is array (0 to NUM_TAPS-2) of acc_t; -- N-1 states
------------------------------------------------------------------------
-- 21 FIR coefficients h[0]..h[20]
------------------------------------------------------------------------
constant COEFFS : coeff_array_t := (
to_signed( 937, COEFF_WIDTH), -- h0
to_signed( 2402, COEFF_WIDTH),
to_signed( 1479, COEFF_WIDTH),
to_signed( -1122, COEFF_WIDTH),
to_signed( -1138, COEFF_WIDTH),
to_signed( 1751, COEFF_WIDTH),
to_signed( 1079, COEFF_WIDTH),
to_signed( -3238, COEFF_WIDTH),
to_signed( -1119, COEFF_WIDTH),
to_signed( 10356, COEFF_WIDTH),
to_signed( 17504, COEFF_WIDTH),
to_signed( 10356, COEFF_WIDTH),
to_signed( -1119, COEFF_WIDTH),
to_signed( -3238, COEFF_WIDTH),
to_signed( 1079, COEFF_WIDTH),
to_signed( 1751, COEFF_WIDTH),
to_signed( -1138, COEFF_WIDTH),
to_signed( -1122, COEFF_WIDTH),
to_signed( 1479, COEFF_WIDTH),
to_signed( 2402, COEFF_WIDTH),
to_signed( 937, COEFF_WIDTH) -- h20
);
------------------------------------------------------------------------
-- State registers for transposed structure: s(0)..s(19)
------------------------------------------------------------------------
signal state : state_array_t := (others => (others => '0'));
signal data_out_reg: signed(15 downto 0) := (others => '0');
begin
data_out <= data_out_reg;
------------------------------------------------------------------------
-- Transposed-form FIR
--
-- Equations (for N=21, indices 0..20):
-- z_20[n] = h20 * x[n]
-- z_k[n] = h(k+1) * x[n] + z_{k+1}[n-1], k = 0..19
-- y[n] = h0 * x[n] + z_0[n-1]
--
-- Here:
-- state(k) holds z_k[n] (k = 0..19)
-- data_out_reg holds y[n]
------------------------------------------------------------------------
fir_proc : process (clk)
variable x_ext : signed(DATA_WIDTH-1 downto 0);
variable prod : signed(PROD_WIDTH-1 downto 0);
variable acc : acc_t;
begin
if rising_edge(clk) then
if rst = '1' then
state <= (others => (others => '0'));
data_out_reg <= (others => '0');
else
-- Extend input to full precision once
x_ext := data_in;
------------------------------------------------------------------
-- Update last state: z_20[n] = h20 * x[n]
------------------------------------------------------------------
prod := x_ext * COEFFS(NUM_TAPS-1); -- h20 * x[n]
acc := resize(prod, ACC_WIDTH);
state(NUM_TAPS-2) <= acc; -- state(19)
------------------------------------------------------------------
-- Update remaining states backward:
-- state(k) <= h(k+1)*x[n] + state(k+1)(old)
-- Note: state(k+1) on RHS is value from previous cycle (n-1),
-- because signal reads see "old" value in this clock.
------------------------------------------------------------------
for k in NUM_TAPS-3 downto 0 loop -- k = 18 .. 0
prod := x_ext * COEFFS(k+1); -- h(k+1) * x[n]
acc := resize(prod, ACC_WIDTH) + state(k+1);
state(k) <= acc;
end loop;
------------------------------------------------------------------
-- Output:
-- y[n] = h0 * x[n] + z_0[n-1] = h0 * x[n] + state(0)(old)
------------------------------------------------------------------
prod := x_ext * COEFFS(0); -- h0 * x[n]
acc := resize(prod, ACC_WIDTH) + state(0);
data_out_reg <= acc(ACC_WIDTH-1 downto ACC_WIDTH-16); -- 16-bit
end if;
end if;
end process;
end architecture rtl;
更新后的设计完成后,即可在目标设备上以所需的时钟频率满足时序要求。

结论
这个示例表明,为 FPGA 织构(fabric)而写的 RTL 在实现高性能上至关重要。两个功能上等价的 FIR 滤波器在实现后的表现可能大相径庭,关键在于它们映射到器件 DSP 与布线资源的契合程度。