aboutsummaryrefslogtreecommitdiffstats
path: root/fpga/usrp3/lib/rfnoc/multiply.v
diff options
context:
space:
mode:
Diffstat (limited to 'fpga/usrp3/lib/rfnoc/multiply.v')
-rw-r--r--fpga/usrp3/lib/rfnoc/multiply.v138
1 files changed, 138 insertions, 0 deletions
diff --git a/fpga/usrp3/lib/rfnoc/multiply.v b/fpga/usrp3/lib/rfnoc/multiply.v
new file mode 100644
index 000000000..ad0353c66
--- /dev/null
+++ b/fpga/usrp3/lib/rfnoc/multiply.v
@@ -0,0 +1,138 @@
+//
+// Copyright 2015 Ettus Research
+//
+// AXI Stream multiplier. Relies on synthesis engine for proper DSP inference.
+
+module multiply #(
+ parameter WIDTH_A = 16,
+ parameter WIDTH_B = 16,
+ parameter WIDTH_P = 32,
+ parameter DROP_TOP_P = 1, // Default drops extra bit (16-bit signed x 16-bit signed => 31-bits signed)
+ parameter LATENCY = 3, // multiplier pipeline latency, 0 - 4
+ parameter EN_SATURATE = 0, // Enable saturating output to avoid overflow (adds +1 to latency)
+ parameter EN_ROUND = 0, // Enable rounding dropped LSBs (adds +1 to latency, total of +2 if used with EN_SATURATE)
+ parameter SIGNED = 1) // Signed multiply
+(
+ input clk, input reset,
+ input [WIDTH_A-1:0] a_tdata, input a_tlast, input a_tvalid, output a_tready,
+ input [WIDTH_B-1:0] b_tdata, input b_tlast, input b_tvalid, output b_tready,
+ output [WIDTH_P-1:0] p_tdata, output p_tlast, output p_tvalid, input p_tready
+);
+
+ localparam A_LATENCY = (LATENCY == 1) ? 1 :
+ (LATENCY == 2) ? 1 :
+ (LATENCY == 3) ? 2 :
+ (LATENCY == 4) ? 2 : 2;
+ localparam B_LATENCY = A_LATENCY;
+ localparam P_LATENCY = (LATENCY == 2) ? 1 :
+ (LATENCY == 3) ? 1 :
+ (LATENCY == 4) ? 2 : 2;
+
+ reg [WIDTH_A-1:0] a_reg[A_LATENCY-1:0];
+ reg [WIDTH_B-1:0] b_reg[B_LATENCY-1:0];
+ reg [WIDTH_A+WIDTH_B-1:0] p_reg[P_LATENCY-1:0];
+
+ wire [A_LATENCY-1:0] en_a_reg;
+ wire [B_LATENCY-1:0] en_b_reg;
+ wire [P_LATENCY-1:0] en_p_reg;
+ wire p_int_tlast, p_int_tvalid, p_int_tready;
+ axi_pipe_join #(
+ .PRE_JOIN_STAGES0(A_LATENCY),
+ .PRE_JOIN_STAGES1(B_LATENCY),
+ .POST_JOIN_STAGES(P_LATENCY))
+ axi_pipe_join (
+ .clk(clk), .reset(reset), .clear(1'b0),
+ .i0_tlast(a_tlast), .i0_tvalid(a_tvalid), .i0_tready(a_tready),
+ .i1_tlast(b_tlast), .i1_tvalid(b_tvalid), .i1_tready(b_tready),
+ .o_tlast(p_int_tlast), .o_tvalid(p_int_tvalid), .o_tready(p_int_tready),
+ .enables0(en_a_reg), .enables1(en_b_reg), .enables_post(en_p_reg));
+
+ // Multiply
+ wire [WIDTH_A+WIDTH_B-1:0] p_mult_signed = (LATENCY == 0) ? $signed(a_tdata) * $signed(b_tdata) : $signed(a_reg[A_LATENCY-1]) * $signed(b_reg[B_LATENCY-1]);
+ wire [WIDTH_A+WIDTH_B-1:0] p_mult_unsigned = (LATENCY == 0) ? a_tdata * b_tdata : a_reg[A_LATENCY-1] * b_reg[B_LATENCY-1];
+ wire [WIDTH_A+WIDTH_B-1:0] p_int_tdata = (LATENCY == 0) ? (SIGNED ? p_mult_signed : p_mult_unsigned) : p_reg[P_LATENCY-1];
+
+ // Register pipeline
+ integer i;
+ always @(posedge clk) begin
+ if (reset) begin
+ for (i = 0; i < A_LATENCY; i = i + 1) begin
+ a_reg[i] <= 'd0;
+ end
+ for (i = 0; i < B_LATENCY; i = i + 1) begin
+ b_reg[i] <= 'd0;
+ end
+ for (i = 0; i < P_LATENCY; i = i + 1) begin
+ p_reg[i] <= 'd0;
+ end
+ end else begin
+ for (i = 0; i < A_LATENCY; i = i + 1) begin
+ if (en_a_reg[i]) begin
+ if (i == 0) begin
+ a_reg[i] <= $signed(a_tdata);
+ end else begin
+ a_reg[i] <= a_reg[i-1];
+ end
+ end
+ end
+ for (i = 0; i < B_LATENCY; i = i + 1) begin
+ if (en_b_reg[i]) begin
+ if (i == 0) begin
+ b_reg[i] <= $signed(b_tdata);
+ end else begin
+ b_reg[i] <= b_reg[i-1];
+ end
+ end
+ end
+ for (i = 0; i < P_LATENCY; i = i + 1) begin
+ if (en_p_reg[i]) begin
+ if (i == 0) begin
+ p_reg[i] <= SIGNED ? p_mult_signed : p_mult_unsigned;
+ end else begin
+ p_reg[i] <= p_reg[i-1];
+ end
+ end
+ end
+ end
+ end
+
+ // Saturate & Round
+ // TODO: Might be able to replace axi_round with DSP's built in rounding
+ generate
+ if ((EN_SATURATE == 1) && (EN_ROUND == 1)) begin
+ axi_round_and_clip #(
+ .WIDTH_IN(WIDTH_A+WIDTH_B),
+ .WIDTH_OUT(WIDTH_P),
+ .CLIP_BITS(DROP_TOP_P))
+ axi_round_and_clip (
+ .clk(clk), .reset(reset),
+ .i_tdata(p_int_tdata), .i_tlast(p_int_tlast), .i_tvalid(p_int_tvalid), .i_tready(p_int_tready),
+ .o_tdata(p_tdata), .o_tlast(p_tlast), .o_tvalid(p_tvalid), .o_tready(p_tready));
+ end else if ((EN_SATURATE == 0) && (EN_ROUND == 1)) begin
+ axi_round #(
+ .WIDTH_IN(WIDTH_A+WIDTH_B-DROP_TOP_P),
+ .WIDTH_OUT(WIDTH_P))
+ axi_round (
+ .clk(clk), .reset(reset),
+ .i_tdata(p_int_tdata[WIDTH_A+WIDTH_B-DROP_TOP_P-1:0]), .i_tlast(p_int_tlast), .i_tvalid(p_int_tvalid), .i_tready(p_int_tready),
+ .o_tdata(p_tdata), .o_tlast(p_tlast), .o_tvalid(p_tvalid), .o_tready(p_tready));
+ end else if ((EN_SATURATE == 1) && (EN_ROUND == 0)) begin
+ wire [WIDTH_A+WIDTH_B-DROP_TOP_P-1:0] p_clip_tdata;
+ axi_clip #(
+ .WIDTH_IN(WIDTH_A+WIDTH_B),
+ .WIDTH_OUT(WIDTH_A+WIDTH_B-DROP_TOP_P),
+ .CLIP_BITS(DROP_TOP_P))
+ axi_clip (
+ .clk(clk), .reset(reset),
+ .i_tdata(p_int_tdata), .i_tlast(p_int_tlast), .i_tvalid(p_int_tvalid), .i_tready(p_int_tready),
+ .o_tdata(p_clip_tdata), .o_tlast(p_tlast), .o_tvalid(p_tvalid), .o_tready(p_tready));
+ assign p_tdata = p_clip_tdata[WIDTH_A+WIDTH_B-DROP_TOP_P-1:WIDTH_A+WIDTH_B-DROP_TOP_P-WIDTH_P];
+ end else begin
+ assign p_tdata = p_int_tdata[WIDTH_A+WIDTH_B-DROP_TOP_P-1:WIDTH_A+WIDTH_B-DROP_TOP_P-WIDTH_P];
+ assign p_tlast = p_int_tlast;
+ assign p_tvalid = p_int_tvalid;
+ assign p_int_tready = p_tready;
+ end
+ endgenerate
+
+endmodule