//
// Copyright 2015 Ettus Research, a National Instruments Company
//
// SPDX-License-Identifier: LGPL-3.0-or-later
//

//
// There are various obligations put on this code not present in regular BRAM based FIFO's
//
// 1) Bursts are way more efficient, use local small FIFO's to interact with DRAM
// 2) Never cross a 4KByte address boundary within a single transaction, this is an AXI4 rule.
// 3) 2^SIZE must be greater than 4KB so that the 4KByte page protection also deals with FIFO wrap corner case.
//
module axi_dma_fifo 
#(
   parameter SIMULATION       = 0,             // Shorten flush counter for simulation
   parameter DEFAULT_BASE     = 30'h00000000,
   parameter DEFAULT_MASK     = 30'hFF000000,
   parameter DEFAULT_TIMEOUT  = 12'd256,
   parameter BUS_CLK_RATE     = 32'd166666666,  // Frequency in Hz of bus_clk
   parameter SR_BASE          = 0,              // Base address for settings registers
   parameter EXT_BIST         = 0,              // If 1 then instantiate extended BIST with dynamic SID, delays and BW counters
   parameter MAX_PKT_LEN      = 12              // Log2 of maximum packet length
) (
   input bus_clk,
   input bus_reset, 
   input dram_clk,
   input dram_reset,
   //
   // AXI Write address channel
   //
   output [0 : 0] m_axi_awid,     // Write address ID. This signal is the identification tag for the write address signals
   output [31 : 0] m_axi_awaddr,  // Write address. The write address gives the address of the first transfer in a write burst
   output [7 : 0] m_axi_awlen,    // Burst length. The burst length gives the exact number of transfers in a burst.
   output [2 : 0] m_axi_awsize,   // Burst size. This signal indicates the size of each transfer in the burst. 
   output [1 : 0] m_axi_awburst,  // Burst type. The burst type and the size information, determine how the address is calculated
   output [0 : 0] m_axi_awlock,   // Lock type. Provides additional information about the atomic characteristics of the transfer.
   output [3 : 0] m_axi_awcache,  // Memory type. This signal indicates how transactions are required to progress
   output [2 : 0] m_axi_awprot,   // Protection type. This signal indicates the privilege and security level of the transaction
   output [3 : 0] m_axi_awqos,    // Quality of Service, QoS. The QoS identifier sent for each write transaction
   output [3 : 0] m_axi_awregion, // Region identifier. Permits a single physical interface on a slave to be re-used.
   output [0 : 0] m_axi_awuser,   // User signal. Optional User-defined signal in the write address channel.
   output m_axi_awvalid,          // Write address valid. This signal indicates that the channel is signaling valid write addr
   input m_axi_awready,           // Write address ready. This signal indicates that the slave is ready to accept an address
   //
   // AXI Write data channel.
   //
   output [63 : 0] m_axi_wdata,   // Write data
   output [7 : 0] m_axi_wstrb,    // Write strobes. This signal indicates which byte lanes hold valid data.
   output m_axi_wlast,            // Write last. This signal indicates the last transfer in a write burst
   output [0 : 0] m_axi_wuser,    // User signal. Optional User-defined signal in the write data channel.
   output m_axi_wvalid,           // Write valid. This signal indicates that valid write data and strobes are available. 
   input m_axi_wready,            // Write ready. This signal indicates that the slave can accept the write data.
   //
   // AXI Write response channel signals
   //
   input [0 : 0] m_axi_bid,       // Response ID tag. This signal is the ID tag of the write response. 
   input [1 : 0] m_axi_bresp,     // Write response. This signal indicates the status of the write transaction.
   input [0 : 0] m_axi_buser,     // User signal. Optional User-defined signal in the write response channel.
   input m_axi_bvalid,            // Write response valid. This signal indicates that the channel is signaling a valid response
   output m_axi_bready,           // Response ready. This signal indicates that the master can accept a write response
   //
   // AXI Read address channel
   //
   output [0 : 0] m_axi_arid,     // Read address ID. This signal is the identification tag for the read address group of signals
   output [31 : 0] m_axi_araddr,  // Read address. The read address gives the address of the first transfer in a read burst
   output [7 : 0] m_axi_arlen,    // Burst length. This signal indicates the exact number of transfers in a burst.
   output [2 : 0] m_axi_arsize,   // Burst size. This signal indicates the size of each transfer in the burst.
   output [1 : 0] m_axi_arburst,  // Burst type. The burst type and the size information determine how the address for each transfer
   output [0 : 0] m_axi_arlock,   // Lock type. This signal provides additional information about the atomic characteristics
   output [3 : 0] m_axi_arcache,  // Memory type. This signal indicates how transactions are required to progress 
   output [2 : 0] m_axi_arprot,   // Protection type. This signal indicates the privilege and security level of the transaction
   output [3 : 0] m_axi_arqos,    // Quality of Service, QoS. QoS identifier sent for each read transaction.
   output [3 : 0] m_axi_arregion, // Region identifier. Permits a single physical interface on a slave to be re-used
   output [0 : 0] m_axi_aruser,   // User signal. Optional User-defined signal in the read address channel.
   output m_axi_arvalid,          // Read address valid. This signal indicates that the channel is signaling valid read addr
   input m_axi_arready,           // Read address ready. This signal indicates that the slave is ready to accept an address
   //
   // AXI Read data channel
   //
   input [0 : 0] m_axi_rid,       // Read ID tag. This signal is the identification tag for the read data group of signals
   input [63 : 0] m_axi_rdata,    // Read data.
   input [1 : 0] m_axi_rresp,     // Read response. This signal indicates the status of the read transfer
   input m_axi_rlast,             // Read last. This signal indicates the last transfer in a read burst.
   input [0 : 0] m_axi_ruser,     // User signal. Optional User-defined signal in the read data channel.
   input m_axi_rvalid,            // Read valid. This signal indicates that the channel is signaling the required read data. 
   output m_axi_rready,           // Read ready. This signal indicates that the master can accept the read data and response
   //
   // CHDR friendly AXI stream input
   //
   input [63:0] i_tdata,
   input i_tlast,
   input i_tvalid,
   output i_tready,
   //
   // CHDR friendly AXI Stream output
   //
   output [63:0] o_tdata,
   output o_tlast,
   output o_tvalid,
   input o_tready,
   //
   // Settings and Readback
   //
   input              set_stb,
   input [7:0]        set_addr,
   input [31:0]       set_data,
   output reg [31:0]  rb_data,
   //
   // Debug Bus
   //
   output [197:0] debug
);

   //
   // We are only solving for width 64bits here, since it's our standard CHDR quanta
   //
   localparam DWIDTH = 64;
   localparam AWIDTH = 30;  //Can address 1GiB of memory

   //
   // Settings and Readback
   //
   wire [2:0]         rb_addr;
   wire               clear_bclk, flush_bclk;
   wire               supress_enable_bclk;
   wire [15:0]        supress_threshold_bclk;
   wire [11:0]        timeout_bclk;
   wire [AWIDTH-1:0]  fifo_base_addr_bclk;
   wire [AWIDTH-1:0]  fifo_addr_mask_bclk;
   wire [0:0]         ctrl_reserved;

   wire [31:0]  rb_fifo_status;
   wire [3:0]   rb_bist_status;
   wire [95:0]  rb_bist_bw_ratio;
   reg  [31:0]  out_pkt_count = 32'd0;

   localparam RB_FIFO_STATUS    = 3'd0;
   localparam RB_BIST_STATUS    = 3'd1;
   localparam RB_BIST_XFER_CNT  = 3'd2;
   localparam RB_BIST_CYC_CNT   = 3'd3;
   localparam RB_BUS_CLK_RATE   = 3'd4;
   localparam RB_OUT_PKT_CNT    = 3'd5;

   // SETTING: Readback Address Register
   // Selects which status word is presented on rb_data (see the rb_data
   // multiplexer in this module).
   // Fields:
   // - [2:0]  : Address for readback register
   //            - 0 = RB_FIFO_STATUS
   //            - 1 = RB_BIST_STATUS
   //            - 2 = RB_BIST_XFER_CNT
   //            - 3 = RB_BIST_CYC_CNT
   //            - 4 = RB_BUS_CLK_RATE
   //            - 5 = RB_OUT_PKT_CNT
   //            - rest reserved (rb_data reads back as 0)
   setting_reg #(.my_addr(SR_BASE + 0), .awidth(8), .width(3), .at_reset(3'b000)) sr_readback
     (.clk(bus_clk), .rst(bus_reset),
      .strobe(set_stb), .addr(set_addr), .in(set_data),
      .out(rb_addr), .changed());

   // SETTING: FIFO Control Register
   // Fields:
   // - [0]     : Clear FIFO and discard stored data
   // - [1]     : Enable read suppression to prioritize writes
   // - [2]     : Flush all packets from the FIFO
   // - [3]     : Reserved
   // - [15:4]  : Timeout (in memory clock beats) for issuing smaller than optimal bursts
   // - [31:16] : Read suppression threshold in number of words
   //
   // Reset value (.at_reset): threshold = 0, timeout = DEFAULT_TIMEOUT,
   // reserved = 0, flush = 0, suppression disabled, clear = 1. The clear bit
   // is therefore set in the reset value (presumably so the FIFO stays
   // cleared until software writes this register -- confirm against setting_reg).
   //
   // NOTE(review): the timeout field is 12 bits wide, but the input and output
   // timeout counters in this module are 9 bits and compare only against
   // set_timeout[8:0] (register bits [12:4]). Register bits [15:13] do not
   // affect those counters.
   setting_reg #(.my_addr(SR_BASE + 1), .awidth(8), .width(32), .at_reset({16'h0, DEFAULT_TIMEOUT[11:0], 1'b0, 1'b0, 1'b0, 1'b1})) sr_fifo_ctrl
     (.clk(bus_clk), .rst(bus_reset),
      .strobe(set_stb), .addr(set_addr), .in(set_data),
      .out({supress_threshold_bclk, timeout_bclk, ctrl_reserved, flush_bclk, supress_enable_bclk, clear_bclk}), .changed());

   // SETTING: FIFO base address in the memory space
   // Fields:
   // - [29:0]  : Base address (AWIDTH bits). Resets to DEFAULT_BASE.
   setting_reg #(
      .my_addr  (SR_BASE + 2),
      .awidth   (8),
      .width    (AWIDTH),
      .at_reset (DEFAULT_BASE)
   ) sr_fifo_base_addr (
      .clk      (bus_clk),
      .rst      (bus_reset),
      .strobe   (set_stb),
      .addr     (set_addr),
      .in       (set_data),
      .out      (fifo_base_addr_bclk),
      .changed  ()
   );

   // SETTING: FIFO address mask in the memory space
   // Address bits whose mask bit is 1 are taken from the base address and
   // stay fixed, giving this FIFO its unique region. Address bits whose mask
   // bit is 0 are free for the FIFO's internal read/write pointers.
   // Fields:
   // - [29:0]  : Address mask (AWIDTH bits). Resets to DEFAULT_MASK.
   setting_reg #(
      .my_addr  (SR_BASE + 3),
      .awidth   (8),
      .width    (AWIDTH),
      .at_reset (DEFAULT_MASK)
   ) sr_fifo_addr_mask (
      .clk      (bus_clk),
      .rst      (bus_reset),
      .strobe   (set_stb),
      .addr     (set_addr),
      .in       (set_data),
      .out      (fifo_addr_mask_bclk),
      .changed  ()
   );

   // Readback multiplexer: rb_addr picks the status word driven onto rb_data.
   // Any address without an entry below reads back as zero.
   always @(*) begin
      rb_data = 32'h0;
      case (rb_addr)
         RB_FIFO_STATUS   : rb_data = rb_fifo_status;
         // Bit 31 reports whether the extended BIST was built in
         RB_BIST_STATUS   : rb_data = {(EXT_BIST?1'b1:1'b0), 27'h0, rb_bist_status};
         RB_BIST_XFER_CNT : rb_data = rb_bist_bw_ratio[79:48];
         RB_BIST_CYC_CNT  : rb_data = rb_bist_bw_ratio[31:0];
         RB_BUS_CLK_RATE  : rb_data = BUS_CLK_RATE;
         RB_OUT_PKT_CNT   : rb_data = out_pkt_count;
         default          : ;
      endcase
   end

   //
   // Bring the clear control bit from the bus_clk domain into the dram_clk
   // domain. The synchronizer is parameterized with INITIAL_VAL = 1.
   //
   wire clear;
   synchronizer #(
      .INITIAL_VAL (1'b1)
   ) clear_sync_inst (
      .clk (dram_clk),
      .rst (1'b0),
      .in  (clear_bclk),
      .out (clear)
   );

   // dram_clk-domain copies of the settings register fields.
   wire               set_suppress_en;
   wire [15:0]        set_supress_threshold;
   wire [11:0]        set_timeout;
   wire [AWIDTH-1:0]  set_fifo_base_addr, set_fifo_addr_mask, set_fifo_addr_mask_bar;

   // The settings cross from bus_clk to dram_clk through two 72-bit wide
   // dual-clock FIFOs that are written and read on every cycle (wr_en and
   // rd_en are tied high). Unused upper bits are zero-padded on the write
   // side and land in the set_sync_discard* wires on the read side.
   //   FIFO 0 payload: timeout (12) + suppress enable (1) + threshold (16) = 29 bits,
   //                   plus the base address (AWIDTH bits).
   //   FIFO 1 payload: inverted mask (AWIDTH bits) + mask (AWIDTH bits).
   wire [(72-AWIDTH-29-1):0]  set_sync_discard0;
   wire [(72-(2*AWIDTH)-1):0] set_sync_discard1;
   fifo_short_2clk set_sync_fifo0(
      .rst(bus_reset),
      .wr_clk(bus_clk), .din({{(72-AWIDTH-29){1'b0}}, timeout_bclk, supress_enable_bclk, supress_threshold_bclk, fifo_base_addr_bclk}),
      .wr_en(1'b1), .full(), .wr_data_count(),
      .rd_clk(dram_clk), .dout({set_sync_discard0, set_timeout, set_suppress_en, set_supress_threshold, set_fifo_base_addr}),
      .rd_en(1'b1), .empty(), .rd_data_count()
   );
   // The inverted mask is computed in the bus_clk domain and carried across
   // alongside the mask itself, so both arrive in the same FIFO word.
   fifo_short_2clk set_sync_fifo1(
      .rst(bus_reset),
      .wr_clk(bus_clk), .din({{(72-(2*AWIDTH)){1'b0}}, ~fifo_addr_mask_bclk, fifo_addr_mask_bclk}),
      .wr_en(1'b1), .full(), .wr_data_count(),
      .rd_clk(dram_clk), .dout({set_sync_discard1, set_fifo_addr_mask_bar, set_fifo_addr_mask}),
      .rd_en(1'b1), .empty(), .rd_data_count()
   );

   //
   // Input side declarations
   //
   // States of the input (DRAM write) state machine:
   localparam [2:0] INPUT_IDLE = 0;  // Wait for DRAM space plus a full burst of input data or a timeout
   localparam [2:0] INPUT1 = 1;      // Request a burst triggered by 256 or more entries in the input FIFO
   localparam [2:0] INPUT2 = 2;      // Request a burst triggered by the input timeout
   localparam [2:0] INPUT3 = 3;      // Hold the request until write_ctrl_ready is seen
   localparam [2:0] INPUT4 = 4;      // Wait for write_ctrl_ready to deassert
   localparam [2:0] INPUT5 = 5;      // Wait for write_ctrl_ready to reassert, then advance write_addr
   localparam [2:0] INPUT6 = 6;      // One cycle delay so 'space' can update before returning to idle

   reg [2:0]   input_state;
   reg         input_timeout_triggered;   // Set once input_timeout_count reaches set_timeout[8:0]
   reg         input_timeout_reset;       // Driven by the input FSM to restart the timeout counter
   reg [8:0]   input_timeout_count;       // 9 bits: only the low 9 bits of the 12-bit timeout setting are compared
   reg [AWIDTH-1:0]  write_addr;          // Byte address of the next DRAM write
   reg         write_ctrl_valid;
   wire        write_ctrl_ready;
   reg [7:0]   write_count = 8'd0;        // Burst length in 64-bit words, minus one
   reg [8:0]   write_count_plus_one = 9'd1;  // Maintain a +1 version to break critical timing paths
   reg         update_write;              // Pulsed for one cycle after write_addr has been advanced

   //
   // Output side declarations
   //
   localparam [2:0] OUTPUT_IDLE = 0;
   localparam [2:0] OUTPUT1 = 1;
   localparam [2:0] OUTPUT2 = 2;
   localparam [2:0] OUTPUT3 = 3;   
   localparam [2:0] OUTPUT4 = 4;
   localparam [2:0] OUTPUT5 = 5;
   localparam [2:0] OUTPUT6 = 6;

   reg [2:0]   output_state;
   reg         output_timeout_triggered;
   reg         output_timeout_reset;
   reg [8:0]   output_timeout_count;
   reg [AWIDTH-1:0]  read_addr;
   reg         read_ctrl_valid;
   wire        read_ctrl_ready;
   reg [7:0]   read_count = 8'd0; 
   reg [8:0]   read_count_plus_one = 9'd1;  // Maintain a +1 version to break critical timing paths
   reg         update_read;
   
   // Track main FIFO active size.
   reg [AWIDTH-3:0] space, occupied, occupied_minus_one;  // Maintain a -1 version to break critical timing paths
   reg [AWIDTH-3:0] input_page_boundry, output_page_boundry;  // Cache in a register to break critical timing paths

   // Assign FIFO status bits
   // 'occupied' lives in the dram_clk domain. It is zero-extended to 72 bits
   // and carried to bus_clk through a dual-clock FIFO that is written and
   // read on every cycle (wr_en and rd_en are tied high).
   wire [71:0] status_out_bclk;
   fifo_short_2clk status_fifo_2clk(
      .rst(dram_reset),
      .wr_clk(dram_clk), .din({{(72-(AWIDTH-2)){1'b0}}, occupied}),
      .wr_en(1'b1), .full(), .wr_data_count(),
      .rd_clk(bus_clk), .dout(status_out_bclk),
      .rd_en(1'b1), .empty(), .rd_data_count()
   );
   // NOTE(review): 'occupied' is AWIDTH-2 = 28 bits wide, but only the low
   // 27 bits are reported below, so bit 27 is not visible to software.
   assign rb_fifo_status[31]    = 1'b1;   //DRAM FIFO signature (validates existence of DRAM FIFO)
   assign rb_fifo_status[30:27] = {o_tvalid, o_tready, i_tvalid, i_tready};   //Ready valid flags
   assign rb_fifo_status[26:0]  = status_out_bclk[26:0];   //FIFO fullness count in 64bit words (max 27 bits = 1GiB)

   ///////////////////////////////////////////////////////////////////////////////
   // Inline BIST for production testing
   //
   wire       i_tready_int;

   wire [DWIDTH-1:0] i_tdata_fifo;
   wire       i_tvalid_fifo, i_tready_fifo, i_tlast_fifo;

   wire [DWIDTH-1:0] i_tdata_bist;
   wire       i_tvalid_bist, i_tready_bist, i_tlast_bist;

   wire [DWIDTH-1:0] o_tdata_int;
   wire       o_tvalid_int, o_tready_int, o_tlast_int;

   wire [DWIDTH-1:0] o_tdata_fifo;
   wire       o_tvalid_fifo, o_tready_fifo, o_tlast_fifo;

   wire [DWIDTH-1:0] o_tdata_bist;
   wire       o_tvalid_bist, o_tready_bist, o_tlast_bist;

   wire [DWIDTH-1:0] o_tdata_gate;
   wire       o_tvalid_gate, o_tready_gate, o_tlast_gate;

   // Input multiplexer in front of the FIFO.
   //   Port 0: external CHDR input stream (i_t*)
   //   Port 1: BIST test pattern stream (i_t*_bist)
   //   Ports 2 and 3: unused, tied off with tvalid = 0
   // The selected stream feeds the FIFO input as i_t*_fifo.
   axi_mux4 #(.PRIO(1), .WIDTH(DWIDTH), .BUFFER(1)) axi_mux (
      .clk(bus_clk), .reset(bus_reset), .clear(clear_bclk),
      .i0_tdata(i_tdata), .i0_tlast(i_tlast), .i0_tvalid(i_tvalid), .i0_tready(i_tready_int),
      .i1_tdata(i_tdata_bist), .i1_tlast(i_tlast_bist), .i1_tvalid(i_tvalid_bist), .i1_tready(i_tready_bist),
      .i2_tdata({DWIDTH{1'b0}}), .i2_tlast(1'b0), .i2_tvalid(1'b0), .i2_tready(),
      .i3_tdata({DWIDTH{1'b0}}), .i3_tlast(1'b0), .i3_tvalid(1'b0), .i3_tready(),
      .o_tdata(i_tdata_fifo), .o_tlast(i_tlast_fifo), .o_tvalid(i_tvalid_fifo), .o_tready(i_tready_fifo)
   );
   // The external tready is forced low while the clear bit is set, so no
   // input word is reported as accepted during a clear.
   assign i_tready = i_tready_int & (~clear_bclk);

   wire       bist_running, bist_done;
   wire [1:0] bist_error;
   
   // BIST pattern block. Its settings registers start at SR_BASE + 4, and it
   // is held in reset by bus_reset or by the clear bit. The extended features
   // (dynamic delay/SID modes and bandwidth counter) are enabled by EXT_BIST.
   // Port naming is relative to the FIFO under test: the i_t* ports connect
   // to input 1 of the input mux (toward the FIFO input), and the o_t* ports
   // connect to output 1 of the output demux (from the FIFO output).
   axi_chdr_test_pattern #(
     .DELAY_MODE(EXT_BIST ? "DYNAMIC" : "STATIC"), 
     .SID_MODE(EXT_BIST ? "DYNAMIC" : "STATIC"), 
     .BW_COUNTER(EXT_BIST ? 1 : 0),
     .SR_BASE(SR_BASE + 4)
   ) axi_chdr_test_pattern_i (
      .clk(bus_clk), .reset(bus_reset | clear_bclk),
      .i_tdata(i_tdata_bist), .i_tlast(i_tlast_bist), .i_tvalid(i_tvalid_bist), .i_tready(i_tready_bist),
      .o_tdata(o_tdata_bist), .o_tlast(o_tlast_bist), .o_tvalid(o_tvalid_bist), .o_tready(o_tready_bist),
      .set_stb(set_stb), .set_addr(set_addr), .set_data(set_data),
      .running(bist_running), .done(bist_done), .error(bist_error), .status_vtr(), .bw_ratio(rb_bist_bw_ratio)
   );
   // Readback layout: [3:2] = error, [1] = done, [0] = running
   assign rb_bist_status = {bist_error, bist_done, bist_running};

   // Output demultiplexer after the FIFO. The destination is
   // {1'b0, bist_running}: 0 while BIST is idle and 1 while it is running.
   // Presumably dest is the output port index, so normal traffic goes to
   // port 0 (packet gate) and BIST traffic goes to port 1 (pattern checker)
   // -- confirm against axi_demux4. Ports 2 and 3 are unused (tready tied low).
   axi_demux4 #(.ACTIVE_CHAN(4'b0011), .WIDTH(DWIDTH)) axi_demux(
      .clk(bus_clk), .reset(bus_reset), .clear(clear_bclk),
      .header(), .dest({1'b0, bist_running}),
      .i_tdata(o_tdata_fifo), .i_tlast(o_tlast_fifo), .i_tvalid(o_tvalid_fifo), .i_tready(o_tready_fifo),
      .o0_tdata(o_tdata_gate), .o0_tlast(o_tlast_gate), .o0_tvalid(o_tvalid_gate), .o0_tready(o_tready_gate),
      .o1_tdata(o_tdata_bist), .o1_tlast(o_tlast_bist), .o1_tvalid(o_tvalid_bist), .o1_tready(o_tready_bist),
      .o2_tdata(), .o2_tlast(), .o2_tvalid(), .o2_tready(1'b0),
      .o3_tdata(), .o3_tlast(), .o3_tvalid(), .o3_tready(1'b0)
   );

   // Packet gate ahead of the output, to absorb any intra-packet bubble
   // cycles. It is sized by MAX_PKT_LEN.
   axi_packet_gate #(
      .WIDTH (DWIDTH),
      .SIZE  (MAX_PKT_LEN)
   ) out_pkt_gate (
      .clk      (bus_clk),
      .reset    (bus_reset),
      .clear    (clear_bclk),
      // From demux output 0
      .i_tdata  (o_tdata_gate),
      .i_tlast  (o_tlast_gate),
      .i_tvalid (o_tvalid_gate),
      .i_tready (o_tready_gate),
      .i_terror (1'b0),
      // To the packet flusher
      .o_tdata  (o_tdata_int),
      .o_tlast  (o_tlast_int),
      .o_tvalid (o_tvalid_int),
      .o_tready (o_tready_int)
   );

   // Packet flusher on the final output. It is enabled while either the
   // clear bit or the flush bit of the FIFO control register is set.
   axis_packet_flush #(
      .WIDTH              (DWIDTH),
      .FLUSH_PARTIAL_PKTS (0),
      .TIMEOUT_W          (1),
      .PIPELINE           ("NONE")
   ) flusher_i (
      .clk           (bus_clk),
      .reset         (bus_reset),
      .enable        (clear_bclk | flush_bclk),
      .timeout       (1'b0),
      .flushing      (),
      .done          (),
      // From the packet gate
      .s_axis_tdata  (o_tdata_int),
      .s_axis_tlast  (o_tlast_int),
      .s_axis_tvalid (o_tvalid_int),
      .s_axis_tready (o_tready_int),
      // Module output stream
      .m_axis_tdata  (o_tdata),
      .m_axis_tlast  (o_tlast),
      .m_axis_tvalid (o_tvalid),
      .m_axis_tready (o_tready)
   );

   // Output packet counter (readback RB_OUT_PKT_CNT). Increments once for
   // every tlast word transferred from the packet gate into the flusher.
   // Cleared by bus_reset only.
   always @(posedge bus_clk) begin
      if (bus_reset)
         out_pkt_count <= 32'd0;
      else if (o_tvalid_int & o_tready_int & o_tlast_int)
         out_pkt_count <= out_pkt_count + 32'd1;
   end

   //
   // Buffer input in FIFOs. Embed the tlast signal in the data stream using an ESCape code.
   //

   wire [DWIDTH-1:0] i_tdata_i0;
   wire             i_tvalid_i0, i_tready_i0, i_tlast_i0;
 
   wire [DWIDTH-1:0] i_tdata_i1;
   wire             i_tvalid_i1, i_tready_i1, i_tlast_i1;

   wire [DWIDTH-1:0] i_tdata_i2;
   wire             i_tvalid_i2, i_tready_i2;

   wire [DWIDTH-1:0] i_tdata_i3;
   wire             i_tvalid_i3, i_tready_i3;

   wire [DWIDTH-1:0] i_tdata_input;
   wire             i_tvalid_input, i_tready_input;
   wire [15:0]      space_input, occupied_input;
   reg [15:0]       space_input_reg;
   reg              supress_reads;


   ///////////////////////////////////////////////////////////////////////////////
   
   // Clock domain crossing for the input stream: bus_clk -> dram_clk.
   // tlast travels in bit 64 of the 72-bit FIFO word; the top 7 bits are unused.
   wire         write_in, read_in, empty_in, full_in;
   wire [6:0]   discard_i0;

   // Write side handshake (bus_clk)
   assign       i_tready_fifo = ~full_in;
   assign       write_in      = i_tvalid_fifo & i_tready_fifo;
   // Read side handshake (dram_clk)
   assign       i_tvalid_i0   = ~empty_in;
   assign       read_in       = i_tvalid_i0 & i_tready_i0;

   fifo_short_2clk fifo_short_2clk_i0 (
      .rst           (bus_reset),
      // Write port
      .wr_clk        (bus_clk),
      .din           ({7'h0,i_tlast_fifo,i_tdata_fifo}),         // input [71 : 0] din
      .wr_en         (write_in),
      .full          (full_in),
      .wr_data_count (),                                         // output [9 : 0] wr_data_count
      // Read port
      .rd_clk        (dram_clk),
      .dout          ({discard_i0,i_tlast_i0,i_tdata_i0}),       // output [71 : 0] dout
      .rd_en         (read_in),
      .empty         (empty_in),
      .rd_data_count ()                                          // output [9 : 0] rd_data_count
   );

   axi_fifo_flop2 #(.WIDTH(DWIDTH+1)) input_pipe_i0
     (
      .clk(dram_clk), 
      .reset(dram_reset), 
      .clear(clear),
      //
      .i_tdata({i_tlast_i0, i_tdata_i0}), 
      .i_tvalid(i_tvalid_i0), 
      .i_tready(i_tready_i0),
      //
      .o_tdata({i_tlast_i1, i_tdata_i1}), 
      .o_tvalid(i_tvalid_i1), 
      .o_tready(i_tready_i1)
   );

   axi_embed_tlast #(.WIDTH(DWIDTH), .ADD_CHECKSUM(0)) axi_embed_tlast_i (
      .clk(dram_clk),
      .reset(dram_reset),
      .clear(clear),
      //
      .i_tdata(i_tdata_i1),
      .i_tlast(i_tlast_i1),
      .i_tvalid(i_tvalid_i1),
      .i_tready(i_tready_i1),
      //
      .o_tdata(i_tdata_i2),
      .o_tvalid(i_tvalid_i2),
      .o_tready(i_tready_i2)
   );

   axi_fifo_flop2 #(.WIDTH(DWIDTH)) input_pipe_i1 (
      .clk(dram_clk), 
      .reset(dram_reset), 
      .clear(clear),
      //
      .i_tdata(i_tdata_i2), 
      .i_tvalid(i_tvalid_i2), 
      .i_tready(i_tready_i2),
      //
      .o_tdata(i_tdata_i3), 
      .o_tvalid(i_tvalid_i3), 
      .o_tready(i_tready_i3)
   );

   axi_fifo #(.WIDTH(DWIDTH),.SIZE(10)) fifo_i1 (
      .clk(dram_clk), 
      .reset(dram_reset), 
      .clear(clear),
      //
      .i_tdata(i_tdata_i3), 
      .i_tvalid(i_tvalid_i3), 
      .i_tready(i_tready_i3),
      //
      .o_tdata(i_tdata_input), 
      .o_tvalid(i_tvalid_input), 
      .o_tready(i_tready_input),
      //
      .space(space_input), 
      .occupied(occupied_input)
   );

   //
   // Monitor space_input (the 'space' output of the input FIFO fifo_i1) to deduce when the DRAM FIFO is running
   // short of bandwidth and there is a danger of backpressure passing upstream of the DRAM FIFO.
   // In this situation suppress read requests to the DRAM FIFO so that more bandwidth is available to writes.
   //
   // supress_reads is asserted while the registered space value is below the
   // threshold from the FIFO control register and suppression is enabled.
   // space_input is registered first, so the decision lags the FIFO by one
   // extra cycle. Neither register has a reset.
   //
   always @(posedge dram_clk) 
      begin
         space_input_reg <= space_input;
         if ((space_input_reg < set_supress_threshold[15:0])  && set_suppress_en)
            supress_reads <= 1'b1;
         else 
            supress_reads <= 1'b0;
      end

   //
   // Buffer output in a local FIFO (fifo_i2, an axi_fifo with SIZE=10). Extract the embedded tlast signal.
   //
   wire [DWIDTH-1:0] o_tdata_output;
   wire             o_tvalid_output, o_tready_output;
   wire [15:0]      space_output, occupied_output;

   wire [DWIDTH-1:0] o_tdata_i0;
   wire             o_tvalid_i0, o_tready_i0;
   
   wire [DWIDTH-1:0] o_tdata_i1;
   wire             o_tvalid_i1, o_tready_i1;
   
   wire [DWIDTH-1:0] o_tdata_i2;
   wire             o_tvalid_i2, o_tready_i2;
   
   wire [DWIDTH-1:0] o_tdata_i3;
   wire             o_tvalid_i3, o_tready_i3;
   
   wire [DWIDTH-1:0] o_tdata_i4;
   wire             o_tvalid_i4, o_tready_i4, o_tlast_i4;

   wire [DWIDTH-1:0] o_tdata_i5;
   wire             o_tvalid_i5, o_tready_i5, o_tlast_i5;

   wire             checksum_error;

   axi_fifo #(.WIDTH(DWIDTH),.SIZE(10)) fifo_i2 (
      .clk(dram_clk), 
      .reset(dram_reset), 
      .clear(clear),
      //
      .i_tdata(o_tdata_output), 
      .i_tvalid(o_tvalid_output), 
      .i_tready(o_tready_output),
      //
      .o_tdata(o_tdata_i0), 
      .o_tvalid(o_tvalid_i0), 
      .o_tready(o_tready_i0),
      //
      .space(space_output), 
      .occupied(occupied_output)
   );

   // Place flops straight after SRAM read access for timing.
   // The downstream tready seen by this stage is masked by supress_reads,
   // so it does not see its output word accepted while suppression is active.
   axi_fifo_flop2 #(.WIDTH(DWIDTH)) output_pipe_i0
     (
      .clk(dram_clk), 
      .reset(dram_reset), 
      .clear(clear),
      //
      .i_tdata(o_tdata_i0), 
      .i_tvalid(o_tvalid_i0), 
      .i_tready(o_tready_i0),
      //
      .o_tdata(o_tdata_i1), 
      .o_tvalid(o_tvalid_i1), 
      .o_tready(o_tready_i1 && ~supress_reads)
   );

   // Read suppression logic
   // The combinational part of this exists between these two axi_fifo_flop2
   // stages: supress_reads masks the tready seen by output_pipe_i0 (above)
   // and the tvalid seen by output_pipe_i1 (below), so neither side sees a
   // transfer on this link while supress_reads is high.
   axi_fifo_flop2 #(.WIDTH(DWIDTH)) output_pipe_i1
     (
      .clk(dram_clk), 
      .reset(dram_reset), 
      .clear(clear),
      //
      .i_tdata(o_tdata_i1), 
      .i_tvalid(o_tvalid_i1 && ~supress_reads), 
      .i_tready(o_tready_i1),
      //
      .o_tdata(o_tdata_i2), 
      .o_tvalid(o_tvalid_i2), 
      .o_tready(o_tready_i2)
   );

   // Pipeline flop before tlast extraction logic
   axi_fifo_flop2 #(.WIDTH(DWIDTH)) output_pipe_i2
     (
      .clk(dram_clk), 
      .reset(dram_reset), 
      .clear(clear),
      //
      .i_tdata(o_tdata_i2), 
      .i_tvalid(o_tvalid_i2), 
      .i_tready(o_tready_i2),
      //
      .o_tdata(o_tdata_i3), 
      .o_tvalid(o_tvalid_i3), 
      .o_tready(o_tready_i3)
   );

    axi_extract_tlast #(.WIDTH(DWIDTH), .VALIDATE_CHECKSUM(0)) axi_extract_tlast_i (
      .clk(dram_clk),
      .reset(dram_reset),
      .clear(clear),
      //
      .i_tdata(o_tdata_i3),
      .i_tvalid(o_tvalid_i3),
      .i_tready(o_tready_i3),
      //
      .o_tdata(o_tdata_i4),
      .o_tlast(o_tlast_i4),
      .o_tvalid(o_tvalid_i4),
      .o_tready(o_tready_i4),
      //
      .checksum_error()
   );

   // Pipeline flop after tlast extraction logic
   axi_fifo_flop2 #(.WIDTH(DWIDTH+1)) output_pipe_i3
     (
      .clk(dram_clk), 
      .reset(dram_reset), 
      .clear(clear),
      //
      .i_tdata({o_tlast_i4,o_tdata_i4}), 
      .i_tvalid(o_tvalid_i4), 
      .i_tready(o_tready_i4),
      //
      .o_tdata({o_tlast_i5,o_tdata_i5}), 
      .o_tvalid(o_tvalid_i5), 
      .o_tready(o_tready_i5)
   );

   // Clock domain crossing for the output stream: dram_clk -> bus_clk.
   // tlast travels in bit 64 of the 72-bit FIFO word; the top 7 bits are unused.
   wire         write_out, read_out, empty_out, full_out;
   wire [6:0]   discard_i1;

   // Write side handshake (dram_clk)
   assign       o_tready_i5   = ~full_out;
   assign       write_out     = o_tvalid_i5 & o_tready_i5;
   // Read side handshake (bus_clk)
   assign       o_tvalid_fifo = ~empty_out;
   assign       read_out      = o_tvalid_fifo & o_tready_fifo;

   fifo_short_2clk fifo_short_2clk_i1 (
      .rst           (dram_reset),
      // Write port
      .wr_clk        (dram_clk),
      .din           ({7'h0,o_tlast_i5,o_tdata_i5}),             // input [71 : 0] din
      .wr_en         (write_out),
      .full          (full_out),
      .wr_data_count (),                                         // output [9 : 0] wr_data_count
      // Read port
      .rd_clk        (bus_clk),
      .dout          ({discard_i1,o_tlast_fifo,o_tdata_fifo}),   // output [71 : 0] dout
      .rd_en         (read_out),
      .empty         (empty_out),
      .rd_data_count ()                                          // output [9 : 0] rd_data_count
   );

   //
   // Input timeout counter.
   // The count advances only while the input state machine is idle and the
   // input FIFO holds data waiting to be written. When the count equals the
   // low 9 bits of the timeout setting, input_timeout_triggered is raised and
   // the count stops. Both are cleared by reset, by clear, or when the input
   // state machine asserts input_timeout_reset.
   //
   always @(posedge dram_clk) begin
      if (dram_reset | clear | input_timeout_reset) begin
         input_timeout_count     <= 9'd0;
         input_timeout_triggered <= 1'b0;
      end else if (input_timeout_count == set_timeout[8:0]) begin
         input_timeout_triggered <= 1'b1;
      end else if (input_state == INPUT_IDLE) begin
         input_timeout_count <= input_timeout_count + ((occupied_input != 16'd0) ? 9'd1 : 9'd0);
      end
   end

   //
   // Input (DRAM write) state machine.
   // Wait for 256 entries in the input FIFO to trigger a DRAM write burst.
   // Timeout can also trigger a burst so fragments of data are not left to rot in the input FIFO.
   // A burst is always shortened so that it ends at or before the edge of the
   // current 4KByte page.
   //
   // On reset or clear, write_addr is reloaded with the masked base address.
   //
   always @(posedge dram_clk)
      if (dram_reset | clear) begin
         input_state <= INPUT_IDLE;
         write_addr <= set_fifo_base_addr & set_fifo_addr_mask;
         input_timeout_reset <= 1'b0;
         write_ctrl_valid <= 1'b0;
         write_count <= 8'd0;
         write_count_plus_one <= 9'd1;
         update_write <= 1'b0;
      end else
         case (input_state)
         //
         // INPUT_IDLE.
         // To start an input transfer to DRAM need:
         // 1) Space in the DRAM FIFO 
         // and either
         // 2) 256 entries in the input FIFO
         // or
         // 3) Timeout waiting for more data.
         //
         INPUT_IDLE: begin
            write_ctrl_valid <= 1'b0;
            update_write <= 1'b0;
            if (space[AWIDTH-3:8] != 'd0) begin // (space > 255): Space in the DRAM FIFO
               if (occupied_input[15:8] != 'd0) begin  // (occupied_input > 255): 256 or more entries in input FIFO
                  input_state <= INPUT1;
                  input_timeout_reset <= 1'b1;
                  // Calculate number of entries remaining until next 4KB page boundary is crossed minus 1.
                  // Note, units of calculation are 64bit wide words. Address is always 64bit aligned.
                  input_page_boundry <= {write_addr[AWIDTH-1:12],9'h1ff} - write_addr[AWIDTH-1:3];   
               end else if (input_timeout_triggered) begin // input FIFO timeout waiting for new data.
                  input_state <= INPUT2;
                  input_timeout_reset <= 1'b1;
                  // Calculate number of entries remaining until next 4KB page boundary is crossed minus 1.
                  // Note, units of calculation are 64bit wide words. Address is always 64bit aligned.
                  input_page_boundry <= {write_addr[AWIDTH-1:12],9'h1ff} - write_addr[AWIDTH-1:3];   
               end else begin
                  input_timeout_reset <= 1'b0;
                  input_state <= INPUT_IDLE;
               end
            end else begin
               input_timeout_reset <= 1'b0;
               input_state <= INPUT_IDLE;
            end
         end
         //
         // INPUT1.
         // Caused by input FIFO reaching 256 entries.
         // Request write burst of lesser of:
         // 1) Entries until page boundary crossed
         // 2) 256.
         //
         INPUT1: begin
            // Replicated write logic to break a read timing critical path for write_count
            // input_page_boundry is at most 511 here, so the [11:8] test is a test for "< 256".
            write_count <= (input_page_boundry[11:8] == 4'd0) ? input_page_boundry[7:0] : 8'd255;
            write_count_plus_one <= (input_page_boundry[11:8] == 4'd0) ? ({1'b0,input_page_boundry[7:0]} + 9'd1) : 9'd256;
            write_ctrl_valid <= 1'b1;
            if (write_ctrl_ready)
               input_state <= INPUT4; // Pre-emptive ACK
            else
               input_state <= INPUT3; // Wait for ACK
         end
         //
         // INPUT2.
         // Caused by timeout of input FIFO. (occupied_input was implicitly less than 256 last cycle)
         // Request write burst of lesser of:
         // 1) Entries until page boundary crossed
         // 2) Entries in input FIFO
         //
         INPUT2: begin
            // Replicated write logic to break a read timing critical path for write_count
            // write_count holds the burst length minus one; write_count_plus_one holds the burst length.
            write_count <= (input_page_boundry < ({3'h0,occupied_input[8:0]} - 12'd1)) ? input_page_boundry[7:0] : (occupied_input[8:0] - 9'd1);
            write_count_plus_one <= (input_page_boundry < ({3'h0,occupied_input[8:0]} - 12'd1)) ? ({1'b0,input_page_boundry[7:0]} + 9'd1) : occupied_input[8:0];
            write_ctrl_valid <= 1'b1;
            if (write_ctrl_ready)
               input_state <= INPUT4; // Pre-emptive ACK
            else
               input_state <= INPUT3; // Wait for ACK
         end
         //
         // INPUT3.
         // Wait in this state for AXI4_DMA engine to accept transaction.
         //
         INPUT3: begin
            if (write_ctrl_ready) begin
               write_ctrl_valid <= 1'b0;
               input_state <= INPUT4; // ACK
            end else begin
               write_ctrl_valid <= 1'b1;
               input_state <= INPUT3; // Wait for ACK
            end
         end
         //
         // INPUT4.
         // Wait here until write_ctrl_ready deasserts.
         // This is important as the next time it asserts we know that a write response was received.
         INPUT4: begin
            write_ctrl_valid <= 1'b0;
            if (!write_ctrl_ready)
               input_state <= INPUT5; // Move on
            else
               input_state <= INPUT4; // Wait for deassert
         end   
         //
         // INPUT5.
         // Transaction has been accepted by AXI4 DMA engine. Now we wait for the re-assertion
         // of write_ctrl_ready which signals that the AXI4 DMA engine has received a response
         // for the whole write transaction and we assume that this means it is committed to DRAM.
         // We are now free to update write_addr pointer and go back to idle state.
         // 
         INPUT5: begin
            write_ctrl_valid <= 1'b0;
            if (write_ctrl_ready) begin
               // Advance by the burst length in bytes (words << 3). Address bits where the mask
               // is 0 take the incremented value (and so wrap inside the FIFO region); address
               // bits where the mask is 1 keep their current value.
               write_addr <= ((write_addr + (write_count_plus_one << 3)) & set_fifo_addr_mask_bar) | (write_addr & set_fifo_addr_mask);
               input_state <= INPUT6;
               update_write <= 1'b1;
            end else begin
               input_state <= INPUT5;
            end
         end
         //
         // INPUT6:
         // Need to let space update before looking if there's more to do.
         // update_write is dropped here, so it is high for exactly one cycle.
         //
         INPUT6: begin
            input_state <= INPUT_IDLE;
            update_write <= 1'b0;
         end

         // Unused state encoding (7): recover to idle.
         default: 
            input_state <= INPUT_IDLE;
      endcase // case(input_state)


   //
   // Output timeout counter.
   // While the output state machine sits in OUTPUT_IDLE with data present in the main FIFO
   // the counter advances once per dram_clk cycle. When it equals the low 9 bits of
   // set_timeout the triggered flag is raised and the counter holds. The flag and counter are
   // cleared by reset, by clear, or by output_timeout_reset from the output state machine.
   //
   always @(posedge dram_clk) begin
      if (dram_reset | clear | output_timeout_reset) begin
         // All three clearing sources have the same effect.
         output_timeout_triggered <= 1'b0;
         output_timeout_count <= 9'd0;
      end else begin
         if (output_timeout_count == set_timeout[8:0])
            // Terminal count reached: latch the flag, counter holds its value.
            output_timeout_triggered <= 1'b1;
         else if (output_state == OUTPUT_IDLE)
            // Advance by one only when the main FIFO is non-empty, otherwise by zero.
            output_timeout_count <= output_timeout_count + {8'd0, (occupied != 'd0)};
      end
   end


   //
   // Output (DRAM read) state machine.
   //
   // A read burst from DRAM is started when the small output FIFO has room for a full
   // burst (space_output > 255) and either:
   //   1) more than 255 entries are held in the main DRAM FIFO, or
   //   2) the output timeout has fired while the main FIFO holds data, so that fragments
   //      of data are not left to rot in the main FIFO.
   // Every burst is clipped so that it never crosses a 4KByte page boundary.
   //
   // read_count is the burst length minus one handed to the DMA engine;
   // read_count_plus_one is the matching number of 64bit words and is what advances
   // read_addr and the occupied/space counters. The two must always describe the same burst.
   //
   always @(posedge dram_clk)
      if (dram_reset | clear) begin
         output_state <= OUTPUT_IDLE;
         read_addr <= set_fifo_base_addr & set_fifo_addr_mask;
         output_timeout_reset <= 1'b0;
         read_ctrl_valid <= 1'b0;
         read_count <= 8'd0;
         read_count_plus_one <= 9'd1;
         update_read <= 1'b0;
      end else
         case (output_state)
         //
         // OUTPUT_IDLE.
         // To start an output transfer from DRAM
         // 1) Space in the small output FIFO
         // and either
         // 2) More than 255 entries in the DRAM FIFO
         // or
         // 3) Timeout waiting for more data (and at least one entry present).
         //
         OUTPUT_IDLE: begin
            read_ctrl_valid <= 1'b0;
            update_read <= 1'b0;
            if (space_output[15:8] != 'd0) begin // (space_output > 255): Space in the output FIFO.
               if (occupied[AWIDTH-3:8] != 'd0) begin // (occupied > 255): full burst available in main FIFO
                  output_state <= OUTPUT1;
                  output_timeout_reset <= 1'b1;
                  // Calculate number of entries remaining until next 4KB page boundary is crossed minus 1.
                  // Note, units of calculation are 64bit wide words. Address is always 64bit aligned.
                  output_page_boundry <= {read_addr[AWIDTH-1:12],9'h1ff} - read_addr[AWIDTH-1:3];
               end else if (output_timeout_triggered && (occupied != 'd0)) begin // output FIFO timeout waiting for new data.
                  // The occupied check stops a timeout that fires with an empty FIFO
                  // (possible when set_timeout[8:0] is zero) from launching a read of
                  // entries that do not exist, which would underflow occupied.
                  output_state <= OUTPUT2;
                  output_timeout_reset <= 1'b1;
                  // Calculate number of entries remaining until next 4KB page boundary is crossed minus 1.
                  // Note, units of calculation are 64bit wide words. Address is always 64bit aligned.
                  output_page_boundry <= {read_addr[AWIDTH-1:12],9'h1ff} - read_addr[AWIDTH-1:3];
               end else begin
                  output_timeout_reset <= 1'b0;
                  output_state <= OUTPUT_IDLE;
               end
            end else begin
               output_timeout_reset <= 1'b0;
               output_state <= OUTPUT_IDLE;
            end
         end // case: OUTPUT_IDLE
         //
         // OUTPUT1.
         // Caused by main FIFO holding more than 255 entries.
         // Request read burst of lesser of:
         // 1) Entries until page boundary crossed
         // 2) 256.
         //
         OUTPUT1: begin
            // Replicated write logic to break a read timing critical path for read_count
            read_count <= (output_page_boundry[11:8] == 4'd0) ? output_page_boundry[7:0] : 8'd255;
            read_count_plus_one <= (output_page_boundry[11:8] == 4'd0) ? ({1'b0,output_page_boundry[7:0]} + 9'd1) : 9'd256;
            read_ctrl_valid <= 1'b1;
            if (read_ctrl_ready)
               output_state <= OUTPUT4; // Pre-emptive ACK
            else
               output_state <= OUTPUT3; // Wait for ACK
         end
         //
         // OUTPUT2.
         // Caused by timeout of main FIFO
         // Request read burst of lesser of:
         // 1) Entries until page boundary crossed
         // 2) Entries in main FIFO
         //
         OUTPUT2: begin
            // Replicated write logic to break a read timing critical path for read_count
            read_count <= (output_page_boundry < occupied_minus_one) ? output_page_boundry[7:0] : occupied_minus_one[7:0];
            // Derive the plus-one value from the same truncated occupied_minus_one that feeds
            // read_count. Using occupied[7:0] directly gives 0 instead of 256 when a write
            // lands after OUTPUT_IDLE and makes occupied[7:0] zero, leaving read_addr and
            // occupied un-advanced after a 256 word burst.
            read_count_plus_one <= (output_page_boundry < occupied_minus_one) ? ({1'b0,output_page_boundry[7:0]} + 9'd1) : ({1'b0,occupied_minus_one[7:0]} + 9'd1);
            read_ctrl_valid <= 1'b1;
            if (read_ctrl_ready)
               output_state <= OUTPUT4; // Pre-emptive ACK
            else
               output_state <= OUTPUT3; // Wait for ACK
         end
         //
         // OUTPUT3.
         // Wait in this state for AXI4_DMA engine to accept transaction.
         //
         OUTPUT3: begin
            if (read_ctrl_ready) begin
               read_ctrl_valid <= 1'b0;
               output_state <= OUTPUT4; // ACK
            end else begin
               read_ctrl_valid <= 1'b1;
               output_state <= OUTPUT3; // Wait for ACK
            end
         end
         //
         // OUTPUT4.
         // Wait here until read_ctrl_ready deasserts.
         // This is important as the next time it asserts we know that a read response was received.
         OUTPUT4: begin
            read_ctrl_valid <= 1'b0;
            if (!read_ctrl_ready)
               output_state <= OUTPUT5; // Move on
            else
               output_state <= OUTPUT4; // Wait for deassert
         end
         //
         // OUTPUT5.
         // Transaction has been accepted by AXI4 DMA engine. Now we wait for the re-assertion
         // of read_ctrl_ready which signals that the AXI4 DMA engine has received a last signal and good response
         // for the whole read transaction.
         // We are now free to update read_addr pointer and go back to idle state.
         //
         OUTPUT5: begin
            read_ctrl_valid <= 1'b0;
            if (read_ctrl_ready) begin
               // Advance within the FIFO window: masked-out (base) bits are kept, the rest wrap.
               read_addr <= ((read_addr + (read_count_plus_one << 3)) & set_fifo_addr_mask_bar) | (read_addr & set_fifo_addr_mask);
               output_state <= OUTPUT6;
               update_read <= 1'b1;
            end else begin
               output_state <= OUTPUT5;
            end
         end // case: OUTPUT5
         //
         // OUTPUT6.
         // Need to get occupied value updated before checking if there's more to do.
         //
         OUTPUT6: begin
            update_read <= 1'b0;
            output_state <= OUTPUT_IDLE;
         end

         default:
            output_state <= OUTPUT_IDLE;
       endcase // case(output_state)

   //
   // Fill level of the main DRAM FIFO, in units of 64bit wide words.
   // occupied_minus_one is a second register that is reset to all ones and receives
   // exactly the same adjustments as occupied, so a pre-decremented copy is available
   // without an extra subtractor. A completed write burst (update_write) adds
   // write_count_plus_one; a completed read burst (update_read) removes read_count_plus_one.
   //
   always @(posedge dram_clk) begin
      if (dram_reset | clear) begin
         occupied_minus_one <= {(AWIDTH-2){1'b1}};
         occupied <= 'd0;
      end else begin
         occupied_minus_one <= occupied_minus_one
                               - (update_read ? read_count_plus_one : 9'd0)
                               + (update_write ? write_count_plus_one : 9'd0);
         occupied <= occupied
                     - (update_read ? read_count_plus_one : 9'd0)
                     + (update_write ? write_count_plus_one : 9'd0);
      end
   end

   //
   // Free space in the main DRAM FIFO, in units of 64bit wide words.
   // On reset the value is loaded from the word-address bits of set_fifo_addr_mask_bar with
   // its low six bits cleared, which holds back 64 entries as allowance for read/write
   // reordering in the DRAM controller. Completed writes consume space, completed reads
   // return it.
   //
   always @(posedge dram_clk) begin
      if (dram_reset | clear) begin
         space <= ~('d63) & set_fifo_addr_mask_bar[AWIDTH-1:3];
      end else begin
         space <= space
                  + (update_read ? read_count_plus_one : 9'd0)
                  - (update_write ? write_count_plus_one : 9'd0);
      end
   end

   //
   // Instance of axi_dma_master.
   // Bridges the simple write/read burst request interfaces driven by the input and
   // output state machines onto the module's m_axi_* AXI4 master port.
   //
   axi_dma_master axi_dma_master_i
   (
      // Clock and reset (reset is also asserted by the local clear signal)
      .aclk             (dram_clk),
      .areset           (dram_reset | clear),
      //
      // AXI4 write address channel
      //
      .m_axi_awid       (m_axi_awid),
      .m_axi_awaddr     (m_axi_awaddr),
      .m_axi_awlen      (m_axi_awlen),
      .m_axi_awsize     (m_axi_awsize),
      .m_axi_awburst    (m_axi_awburst),
      .m_axi_awvalid    (m_axi_awvalid),
      .m_axi_awready    (m_axi_awready),
      .m_axi_awlock     (m_axi_awlock),
      .m_axi_awcache    (m_axi_awcache),
      .m_axi_awprot     (m_axi_awprot),
      .m_axi_awqos      (m_axi_awqos),
      .m_axi_awregion   (m_axi_awregion),
      .m_axi_awuser     (m_axi_awuser),
      //
      // AXI4 write data channel
      //
      .m_axi_wdata      (m_axi_wdata),
      .m_axi_wstrb      (m_axi_wstrb),
      .m_axi_wlast      (m_axi_wlast),
      .m_axi_wvalid     (m_axi_wvalid),
      .m_axi_wready     (m_axi_wready),
      .m_axi_wuser      (m_axi_wuser),
      //
      // AXI4 write response channel
      //
      .m_axi_bid        (m_axi_bid),
      .m_axi_bresp      (m_axi_bresp),
      .m_axi_bvalid     (m_axi_bvalid),
      .m_axi_bready     (m_axi_bready),
      .m_axi_buser      (m_axi_buser),
      //
      // AXI4 read address channel
      //
      .m_axi_arid       (m_axi_arid),
      .m_axi_araddr     (m_axi_araddr),
      .m_axi_arlen      (m_axi_arlen),
      .m_axi_arsize     (m_axi_arsize),
      .m_axi_arburst    (m_axi_arburst),
      .m_axi_arvalid    (m_axi_arvalid),
      .m_axi_arready    (m_axi_arready),
      .m_axi_arlock     (m_axi_arlock),
      .m_axi_arcache    (m_axi_arcache),
      .m_axi_arprot     (m_axi_arprot),
      .m_axi_arqos      (m_axi_arqos),
      .m_axi_arregion   (m_axi_arregion),
      .m_axi_aruser     (m_axi_aruser),
      //
      // AXI4 read data channel
      //
      .m_axi_rid        (m_axi_rid),
      .m_axi_rdata      (m_axi_rdata),
      .m_axi_rresp      (m_axi_rresp),
      .m_axi_rlast      (m_axi_rlast),
      .m_axi_rvalid     (m_axi_rvalid),
      .m_axi_rready     (m_axi_rready),
      .m_axi_ruser      (m_axi_ruser),
      //
      // DMA interface for write transactions (driven by the input state machine)
      //
      .write_addr       ({{(32-AWIDTH){1'b0}}, write_addr}), // Byte address of burst start, zero-extended to 32 bits (should be 64bit aligned)
      .write_count      (write_count),                       // Burst length request in 64bit words
      .write_ctrl_valid (write_ctrl_valid),
      .write_ctrl_ready (write_ctrl_ready),
      .write_data       (i_tdata_input),
      .write_data_valid (i_tvalid_input),
      .write_data_ready (i_tready_input),
      //
      // DMA interface for read transactions (driven by the output state machine)
      //
      .read_addr        ({{(32-AWIDTH){1'b0}}, read_addr}),  // Byte address of burst start, zero-extended to 32 bits (should be 64bit aligned)
      .read_count       (read_count),                        // Burst length request in 64bit words
      .read_ctrl_valid  (read_ctrl_valid),
      .read_ctrl_ready  (read_ctrl_ready),
      .read_data        (o_tdata_output),
      .read_data_valid  (o_tvalid_output),
      .read_data_ready  (o_tready_output),
      //
      // Debug (left unconnected)
      //
      .debug            ()
   );

  //ila_axi_dma_fifo inst_ila (
  //  .clk(ce_clk), // input wire clk
  //  .probe0(rb_bist_status), // input wire [3:0]  probe0  channel 0
  //  .probe1(), // input wire [3:0]  probe0  channel 0
  //);




 endmodule // axi_dma_fifo