diff options
Diffstat (limited to 'fpga/usrp3/lib/rfnoc/multiply.v')
-rw-r--r-- | fpga/usrp3/lib/rfnoc/multiply.v | 138 |
1 files changed, 138 insertions, 0 deletions
diff --git a/fpga/usrp3/lib/rfnoc/multiply.v b/fpga/usrp3/lib/rfnoc/multiply.v new file mode 100644 index 000000000..ad0353c66 --- /dev/null +++ b/fpga/usrp3/lib/rfnoc/multiply.v @@ -0,0 +1,138 @@ +// +// Copyright 2015 Ettus Research +// +// AXI Stream multiplier. Relies on synthesis engine for proper DSP inference. + +module multiply #( + parameter WIDTH_A = 16, + parameter WIDTH_B = 16, + parameter WIDTH_P = 32, + parameter DROP_TOP_P = 1, // Default drops extra bit (16-bit signed x 16-bit signed => 31-bits signed) + parameter LATENCY = 3, // multiplier pipeline latency, 0 - 4 + parameter EN_SATURATE = 0, // Enable saturating output to avoid overflow (adds +1 to latency) + parameter EN_ROUND = 0, // Enable rounding dropped LSBs (adds +1 to latency, total of +2 if used with EN_SATURATE) + parameter SIGNED = 1) // Signed multiply +( + input clk, input reset, + input [WIDTH_A-1:0] a_tdata, input a_tlast, input a_tvalid, output a_tready, + input [WIDTH_B-1:0] b_tdata, input b_tlast, input b_tvalid, output b_tready, + output [WIDTH_P-1:0] p_tdata, output p_tlast, output p_tvalid, input p_tready +); + + localparam A_LATENCY = (LATENCY == 1) ? 1 : + (LATENCY == 2) ? 1 : + (LATENCY == 3) ? 2 : + (LATENCY == 4) ? 2 : 2; + localparam B_LATENCY = A_LATENCY; + localparam P_LATENCY = (LATENCY == 2) ? 1 : + (LATENCY == 3) ? 1 : + (LATENCY == 4) ? 2 : 2; + + reg [WIDTH_A-1:0] a_reg[A_LATENCY-1:0]; + reg [WIDTH_B-1:0] b_reg[B_LATENCY-1:0]; + reg [WIDTH_A+WIDTH_B-1:0] p_reg[P_LATENCY-1:0]; + + wire [A_LATENCY-1:0] en_a_reg; + wire [B_LATENCY-1:0] en_b_reg; + wire [P_LATENCY-1:0] en_p_reg; + wire p_int_tlast, p_int_tvalid, p_int_tready; + axi_pipe_join #( + .PRE_JOIN_STAGES0(A_LATENCY), + .PRE_JOIN_STAGES1(B_LATENCY), + .POST_JOIN_STAGES(P_LATENCY)) + axi_pipe_join ( + .clk(clk), .reset(reset), .clear(1'b0), + .i0_tlast(a_tlast), .i0_tvalid(a_tvalid), .i0_tready(a_tready), + .i1_tlast(b_tlast), .i1_tvalid(b_tvalid), .i1_tready(b_tready), + .o_tlast(p_int_tlast), .o_tvalid(p_int_tvalid), .o_tready(p_int_tready), + .enables0(en_a_reg), .enables1(en_b_reg), .enables_post(en_p_reg)); + + // Multiply + wire [WIDTH_A+WIDTH_B-1:0] p_mult_signed = (LATENCY == 0) ? $signed(a_tdata) * $signed(b_tdata) : $signed(a_reg[A_LATENCY-1]) * $signed(b_reg[B_LATENCY-1]); + wire [WIDTH_A+WIDTH_B-1:0] p_mult_unsigned = (LATENCY == 0) ? a_tdata * b_tdata : a_reg[A_LATENCY-1] * b_reg[B_LATENCY-1]; + wire [WIDTH_A+WIDTH_B-1:0] p_int_tdata = (LATENCY == 0) ? (SIGNED ? p_mult_signed : p_mult_unsigned) : p_reg[P_LATENCY-1]; + + // Register pipeline + integer i; + always @(posedge clk) begin + if (reset) begin + for (i = 0; i < A_LATENCY; i = i + 1) begin + a_reg[i] <= 'd0; + end + for (i = 0; i < B_LATENCY; i = i + 1) begin + b_reg[i] <= 'd0; + end + for (i = 0; i < P_LATENCY; i = i + 1) begin + p_reg[i] <= 'd0; + end + end else begin + for (i = 0; i < A_LATENCY; i = i + 1) begin + if (en_a_reg[i]) begin + if (i == 0) begin + a_reg[i] <= $signed(a_tdata); + end else begin + a_reg[i] <= a_reg[i-1]; + end + end + end + for (i = 0; i < B_LATENCY; i = i + 1) begin + if (en_b_reg[i]) begin + if (i == 0) begin + b_reg[i] <= $signed(b_tdata); + end else begin + b_reg[i] <= b_reg[i-1]; + end + end + end + for (i = 0; i < P_LATENCY; i = i + 1) begin + if (en_p_reg[i]) begin + if (i == 0) begin + p_reg[i] <= SIGNED ? p_mult_signed : p_mult_unsigned; + end else begin + p_reg[i] <= p_reg[i-1]; + end + end + end + end + end + + // Saturate & Round + // TODO: Might be able to replace axi_round with DSP's built in rounding + generate + if ((EN_SATURATE == 1) && (EN_ROUND == 1)) begin + axi_round_and_clip #( + .WIDTH_IN(WIDTH_A+WIDTH_B), + .WIDTH_OUT(WIDTH_P), + .CLIP_BITS(DROP_TOP_P)) + axi_round_and_clip ( + .clk(clk), .reset(reset), + .i_tdata(p_int_tdata), .i_tlast(p_int_tlast), .i_tvalid(p_int_tvalid), .i_tready(p_int_tready), + .o_tdata(p_tdata), .o_tlast(p_tlast), .o_tvalid(p_tvalid), .o_tready(p_tready)); + end else if ((EN_SATURATE == 0) && (EN_ROUND == 1)) begin + axi_round #( + .WIDTH_IN(WIDTH_A+WIDTH_B-DROP_TOP_P), + .WIDTH_OUT(WIDTH_P)) + axi_round ( + .clk(clk), .reset(reset), + .i_tdata(p_int_tdata[WIDTH_A+WIDTH_B-DROP_TOP_P-1:0]), .i_tlast(p_int_tlast), .i_tvalid(p_int_tvalid), .i_tready(p_int_tready), + .o_tdata(p_tdata), .o_tlast(p_tlast), .o_tvalid(p_tvalid), .o_tready(p_tready)); + end else if ((EN_SATURATE == 1) && (EN_ROUND == 0)) begin + wire [WIDTH_A+WIDTH_B-DROP_TOP_P-1:0] p_clip_tdata; + axi_clip #( + .WIDTH_IN(WIDTH_A+WIDTH_B), + .WIDTH_OUT(WIDTH_A+WIDTH_B-DROP_TOP_P), + .CLIP_BITS(DROP_TOP_P)) + axi_clip ( + .clk(clk), .reset(reset), + .i_tdata(p_int_tdata), .i_tlast(p_int_tlast), .i_tvalid(p_int_tvalid), .i_tready(p_int_tready), + .o_tdata(p_clip_tdata), .o_tlast(p_tlast), .o_tvalid(p_tvalid), .o_tready(p_tready)); + assign p_tdata = p_clip_tdata[WIDTH_A+WIDTH_B-DROP_TOP_P-1:WIDTH_A+WIDTH_B-DROP_TOP_P-WIDTH_P]; + end else begin + assign p_tdata = p_int_tdata[WIDTH_A+WIDTH_B-DROP_TOP_P-1:WIDTH_A+WIDTH_B-DROP_TOP_P-WIDTH_P]; + assign p_tlast = p_int_tlast; + assign p_tvalid = p_int_tvalid; + assign p_int_tready = p_tready; + end + endgenerate + +endmodule |