From c19e89b03bc1ddddcb9585636b513ad9df6326c7 Mon Sep 17 00:00:00 2001
From: "Wesley W. Terpstra" <w.terpstra@gsi.de>
Date: Tue, 17 Apr 2012 11:09:14 +0200
Subject: [PATCH] TX path added to altera pcie wrapper; stub wishbone device
 added for testing

---
 hdl/pcie_altera.vhd | 138 +++++++++++++++++++++++++++++++++++++++++---
 hdl/pcie_tlp.vhd    |  37 +++++++++++-
 hdl/pcie_wb.vhd     |  38 +++++++++---
 hdl/pcie_wb_pkg.vhd |  22 +++++--
 4 files changed, 212 insertions(+), 23 deletions(-)

diff --git a/hdl/pcie_altera.vhd b/hdl/pcie_altera.vhd
index 81072b1f..03a5fe1b 100644
--- a/hdl/pcie_altera.vhd
+++ b/hdl/pcie_altera.vhd
@@ -23,9 +23,14 @@ entity pcie_altera is
     rx_wb_dat_o   : out std_logic_vector(31 downto 0);
     rx_wb_stall_i : in  std_logic;
     
-    tx_wb_stb_i   : in  std_logic;
-    tx_wb_dat_i   : in  std_logic_vector(31 downto 0);
-    tx_wb_stall_o : out std_logic);
+    -- pre-allocate buffer space used for TX
+    tx_rdy_o      : out std_logic;
+    tx_alloc_i    : in  std_logic; -- may only set '1' if rdy_o = '1'
+    
+    -- push TX data
+    tx_en_i       : in  std_logic; -- may never exceed alloc_i
+    tx_dat_i      : in  std_logic_vector(31 downto 0);
+    tx_eop_i      : in  std_logic); -- Mark last strobe
 end pcie_altera;
 
 architecture rtl of pcie_altera is
@@ -188,6 +193,15 @@ architecture rtl of pcie_altera is
       return '0';
     end if;
   end is_zero;
+  
+  function active_high(x : boolean) return std_logic is -- boolean condition -> active-high level
+    variable level : std_logic := '0';
+  begin
+    if x then
+      level := '1';
+    end if;
+    return level;
+  end active_high;
 
   signal core_clk_out : std_logic;
   signal rstn : std_logic;
@@ -205,6 +219,8 @@ architecture rtl of pcie_altera is
   signal npor, crst, srst, rst_reg : std_logic;
   signal pme_shift : std_logic_vector(4 downto 0);
   
+  -- RX registers and signals
+  
   signal rx_st_ready0, rx_st_valid0 : std_logic;
   signal rx_st_be0 : std_logic_vector(7 downto 0);
   signal rx_st_data0 : std_logic_vector(63 downto 0);
@@ -215,6 +231,31 @@ architecture rtl of pcie_altera is
   
   signal r32_word, s32_word, s32_progress, r32_full, s32_need_refill, r32_skip, s32_enter0 : std_logic;
   signal r32_dat0, r32_dat1 : std_logic_vector(31 downto 0);
+  
+  -- TX registers and signals
+  
+  constant log_bytes  : integer := 9; -- 256 byte maximum TLP, but we allocate twice the space to simplify provisioning
+  constant buf_length : integer := (2**log_bytes)/8; -- queue depth in 64-bit (8 byte) entries
+  constant buf_bits   : integer := log_bytes-3; -- index width of the queue: 2**buf_bits = buf_length
+  type queue_t is array(buf_length-1 downto 0) of std_logic_vector(64 downto 0); -- bit 64 = end-of-packet flag, 63..0 = data
+  
+  signal tx_st_sop0, tx_st_eop0, tx_st_ready0, tx_st_valid0 : std_logic; -- connected to the Avalon TX ports in the port map
+  signal tx_st_data0 : std_logic_vector(63 downto 0);
+  signal s_eop, tx_queue_stall : std_logic; -- NOTE(review): tx_queue_stall is not referenced in the visible code
+  signal r_sop : std_logic := '1'; -- '1' while the next word sent begins a packet (reloaded from s_eop after each word)
+  
+  signal queue : queue_t;
+  
+  -- Invariant idxr <= idxe <= idxw <= idxa, extra bit is for wrap-around
+  signal r_idxr, r_idxw, r_idxa, r_idxe, s_idxw_p1 : unsigned(buf_bits downto 0); -- r=read, e=end of last complete packet, w=write, a=allocated
+  signal r_delay_ready : std_logic_vector(1 downto 0); -- length must equal the latency of the Avalon TX bus
+  
+  signal s_queue_wdat : std_logic_vector(63 downto 0);
+  signal s_queue_wen, s_64to32_full, r_tx32_full, r_pad : std_logic; -- NOTE(review): r_pad is only ever reset to '0' in the visible code
+  
+  constant zero32 : std_logic_vector(31 downto 0) := (others => '0');
+  signal r_tx_dat0 : std_logic_vector(31 downto 0); -- last 32-bit word pushed, held until its pair arrives
+  
 begin
 
   reconfig_clk <= cal_clk50_i;
@@ -277,12 +318,12 @@ begin
       r2c_err0             => open,
 
       -- Avalon TX
-      tx_st_data0          => (others => '0'),
-      tx_st_eop0           => '0',
+      tx_st_data0          => tx_st_data0,
+      tx_st_eop0           => tx_st_eop0,
       tx_st_err0           => '0',
-      tx_st_sop0           => '0',
-      tx_st_valid0         => '0',
-      tx_st_ready0         => open,
+      tx_st_sop0           => tx_st_sop0,
+      tx_st_valid0         => tx_st_valid0,
+      tx_st_ready0         => tx_st_ready0,
       tx_fifo_empty0       => open,
       tx_fifo_full0        => open,
       tx_fifo_rdptr0       => open, --  3 downto 0
@@ -490,7 +531,7 @@ begin
   -- Issue a fetch only if we need refill and no fetch is pending
   rx_st_ready0 <= s64_need_refill and is_zero(r64_ready(r64_ready'length-2 downto 0));
   
-  rx_data64: process(core_clk_out)
+  rx_data64 : process(core_clk_out)
   begin
     if rising_edge(core_clk_out) then
       if rstn = '0' then
@@ -505,4 +546,83 @@ begin
       r64_skip <= s64_skip;
     end if;
   end process;
+  
+  -- TX queue read side: the entry at the R pointer is presented on the Avalon TX port
+  tx_st_data0 <= queue(to_integer(r_idxr(buf_bits-1 downto 0)))(63 downto 0); -- head entry: data word
+  s_eop       <= queue(to_integer(r_idxr(buf_bits-1 downto 0)))(64); -- head entry: end-of-packet flag
+  tx_st_eop0  <= s_eop;
+  tx_st_sop0  <= r_sop;
+  
+  tx_st_valid0 <= active_high(r_idxr /= r_idxe) and r_delay_ready(r_delay_ready'length-1); -- entries below E exist and the delayed ready is set
+  
+  tx_data64_r : process(core_clk_out) -- advances the R pointer and tracks start-of-packet
+  begin
+    if rising_edge(core_clk_out) then
+      if rstn = '0' then
+        r_delay_ready <= (others => '0');
+        r_idxr <= (others => '0');
+        r_sop <= '1'; -- the first word after reset starts a packet
+      else
+        r_delay_ready <= r_delay_ready(r_delay_ready'length-2 downto 0) & tx_st_ready0; -- shift register delaying tx_st_ready0
+        if tx_st_valid0 = '1' then
+          r_idxr <= r_idxr + 1; -- the head entry was sent this cycle
+          r_sop <= s_eop; -- a word following an end-of-packet word is a start-of-packet word
+        end if;
+      end if;
+    end if;
+  end process;
+  
+  -- Ready unless full. Full = A and R share a slot index but differ in the wrap bit (A lapped R); equal pointers = empty
+  tx_rdy_o <= active_high(r_idxa(buf_bits-1 downto 0) /= r_idxr(buf_bits-1 downto 0)) or -- different slot: space is left
+              active_high(r_idxa(buf_bits) = r_idxr(buf_bits));                        -- same slot, same lap: empty
+  
+  s_idxw_p1 <= r_idxw + 1; -- W pointer value after the pending write
+  tx_data64_w : process(core_clk_out) -- write side: maintains the W, E and A pointers
+  begin
+    if rising_edge(core_clk_out) then
+      if rstn = '0' then
+        r_idxw <= (others => '0');
+        r_idxa <= (others => '0');
+        r_idxe <= (others => '0');
+      else
+        queue(to_integer(r_idxw(buf_bits-1 downto 0))) <= tx_eop_i & s_queue_wdat; -- stored every cycle; only kept when W advances
+        
+        if s_queue_wen = '1' then
+          r_idxw <= s_idxw_p1; -- commit the entry just stored
+        end if;
+        
+        if (s_queue_wen and tx_eop_i) = '1' then
+          r_idxe <= s_idxw_p1; -- packet complete: the reader may now send up to here
+          r_idxa <= s_idxw_p1; -- clear over-allocation
+        end if;
+        
+        if tx_alloc_i = '1' then
+          r_idxa <= r_idxa + 1; -- NOTE(review): last assignment wins, so an alloc in the same cycle as an eop write skips the clear above - confirm intended
+        end if;
+      end if;
+    end if;
+  end process;
+  
+  s_queue_wdat <= -- 64-bit entry assembled from 32-bit pushes
+    (zero32 & tx_dat_i) when r_tx32_full = '0' else -- no word held: current word in the low half, high half zero
+    (tx_dat_i & r_tx_dat0); -- word held: held word low, current word high
+  
+  s_64to32_full <= r_tx32_full or r_pad or tx_eop_i; -- this push completes an entry: a word is held, r_pad is set, or it is the last word
+  s_queue_wen <= tx_en_i and s_64to32_full; -- write the queue only on a push that completes an entry
+  
+  tx_data32 : process(core_clk_out) -- holds the first word of each pair
+  begin
+    if rising_edge(core_clk_out) then
+      if rstn = '0' then
+        r_tx_dat0 <= (others => '0');
+        r_tx32_full <= '0';
+        r_pad <= '0';
+      else
+        if tx_en_i = '1' then
+          r_tx_dat0 <= tx_dat_i; -- remember the pushed word in case its pair follows
+          r_tx32_full <= not s_64to32_full; -- a word stays held only if this push did not complete an entry
+        end if;
+      end if;
+    end if;
+  end process;
 end rtl;
diff --git a/hdl/pcie_tlp.vhd b/hdl/pcie_tlp.vhd
index d941698d..b12bef5a 100644
--- a/hdl/pcie_tlp.vhd
+++ b/hdl/pcie_tlp.vhd
@@ -12,12 +12,21 @@ entity pcie_tlp is
     rx_wb_dat_i   : in  std_logic_vector(31 downto 0);
     rx_wb_stall_o : out std_logic;
     
+    tx_rdy_i      : out std_logic; -- NOTE(review): "_i" suffix but mode out; the "_o" ports below are mode in - confirm directions
+    tx_alloc_o    : in  std_logic; -- NOTE(review): mode in without a default value - confirm how the instantiation associates it
+    tx_en_o       : in  std_logic;
+    tx_dat_o      : in  std_logic_vector(31 downto 0);
+    tx_eop_o      : in  std_logic;
+    
     wb_stb_o      : out std_logic;
     wb_adr_o      : out std_logic_vector(63 downto 0);
     wb_we_o       : out std_logic;
     wb_dat_o      : out std_logic_vector(31 downto 0);
     wb_sel_o      : out std_logic_vector(3 downto 0);
-    wb_stall_i    : in  std_logic);
+    wb_stall_i    : in  std_logic;
+    wb_ack_i      : in  std_logic;
+    wb_err_i      : in  std_logic;
+    wb_dat_i      : in  std_logic_vector(31 downto 0));
 end pcie_tlp;
 
 architecture rtl of pcie_tlp is
@@ -54,9 +63,14 @@ architecture rtl of pcie_tlp is
   -- Stall and strobe bypass mux
   signal r_always_stall, r_never_stall : std_logic;
   signal r_always_stb,   r_never_stb   : std_logic;
+  
+  -- Inflight reads and writes
+  signal wb_stb : std_logic; -- internal copy of the strobe driven onto wb_stb_o
+  signal r_flight_count : unsigned(4 downto 0) := (others => '0'); -- start at zero: the counter process has no reset branch
 begin
   rx_wb_stall_o <= r_always_stall or (not r_never_stall and wb_stall_i);
-  wb_stb_o <= r_always_stb or (not r_never_stb and rx_wb_stb_i);
+  wb_stb <= r_always_stb or (not r_never_stb and rx_wb_stb_i); -- internal signal so the flight counter can read the strobe
+  wb_stb_o <= wb_stb;
   wb_adr_o <= r_address;
   wb_dat_o <= rx_wb_dat_i;
   
@@ -251,4 +265,23 @@ begin
       end if;
     end if;
   end process;
+  
+  -- Counts wishbone requests that the slave accepted but has not yet answered.
+  -- An answer is an ack or an err; r_flight_count holds the number outstanding.
+  flight_counter : process(clk_i)
+    variable v_issued, v_answered : std_logic;
+  begin
+    if rising_edge(clk_i) then
+      -- A strobe only counts in a cycle where the slave is not stalling it;
+      -- a stalled strobe has not been accepted and will not be answered.
+      v_issued   := wb_stb and not wb_stall_i;
+      v_answered := wb_ack_i or wb_err_i;
+      if v_issued = '1' and v_answered /= '1' then
+        r_flight_count <= r_flight_count + 1;
+      elsif v_answered = '1' and v_issued /= '1' then
+        r_flight_count <= r_flight_count - 1;
+      end if; -- otherwise idle, or an issue and an answer cancel out: hold the count
+      -- NOTE(review): nothing bounds the 5-bit count here; confirm at most 31 requests can be outstanding
+    end if;
+  end process;
 end rtl;
diff --git a/hdl/pcie_wb.vhd b/hdl/pcie_wb.vhd
index 7ebba6bb..9d597d4b 100644
--- a/hdl/pcie_wb.vhd
+++ b/hdl/pcie_wb.vhd
@@ -46,6 +46,13 @@ architecture rtl of pcie_wb is
   
   signal rx_wb_stb, rx_wb_stall : std_logic;
   signal rx_wb_dat : std_logic_vector(31 downto 0);
+  
+  signal tx_rdy, tx_alloc, tx_en, tx_eop : std_logic; -- NOTE(review): tx_alloc, tx_en and tx_eop have no driver in the visible hunks
+  signal tx_dat : std_logic_vector(31 downto 0); -- NOTE(review): likewise undriven in the visible hunks
+  
+  signal wb_stb_o, wb_we_o, wb_ack_i : std_logic; -- internal wishbone wires between pcie_logic and the demo register
+  signal wb_dat_o, wb_dat_i, demo_reg : std_logic_vector(31 downto 0);
+  
 begin
 
   reset : pow_reset
@@ -76,10 +83,11 @@ begin
     rx_wb_stb_o   => rx_wb_stb,
     rx_wb_dat_o   => rx_wb_dat,
     rx_wb_stall_i => rx_wb_stall,
-    -- No TX... yet.
-    tx_wb_stb_i   => '0',
-    tx_wb_dat_i   => (others => '0'),
-    tx_wb_stall_o => open);
+    tx_rdy_o      => tx_rdy,
+    tx_alloc_i    => tx_alloc,
+    tx_en_i       => tx_en,
+    tx_dat_i      => tx_dat,
+    tx_eop_i      => tx_eop);
   
   pcie_logic : pcie_tlp port map(
     clk_i         => wb_clk,
@@ -90,12 +98,26 @@ begin
     rx_wb_dat_i   => rx_wb_dat,
     rx_wb_stall_o => rx_wb_stall,
     
-    wb_stb_o      => open,
+    wb_stb_o      => wb_stb_o,
     wb_adr_o      => open,
-    wb_we_o       => open,
-    wb_dat_o      => open,
+    wb_we_o       => wb_we_o,
+    wb_dat_o      => wb_dat_o,
     wb_sel_o      => open,
-    wb_stall_i    => stall);
+    wb_stall_i    => stall,
+    wb_ack_i      => wb_ack_i,
+    wb_err_i      => '0',
+    wb_dat_i      => wb_dat_i);
+  
+  wb_dat_i <= demo_reg; -- read data is always the demo register
+  demo : process(wb_clk) -- stub wishbone slave: one 32-bit register, ack one cycle after an unstalled strobe
+    variable v_accept : std_logic; -- strobe seen this cycle while not stalled
+  begin
+    if rising_edge(wb_clk) then
+      v_accept := wb_stb_o and not stall;
+      wb_ack_i <= v_accept;
+      if (v_accept and wb_we_o) = '1' then demo_reg <= wb_dat_o; end if;
+    end if;
+  end process;
   
   blink : process(wb_clk)
   begin
diff --git a/hdl/pcie_wb_pkg.vhd b/hdl/pcie_wb_pkg.vhd
index ccfc60cc..8e3f7cba 100644
--- a/hdl/pcie_wb_pkg.vhd
+++ b/hdl/pcie_wb_pkg.vhd
@@ -24,9 +24,14 @@ package pcie_wb_pkg is
       rx_wb_dat_o   : out std_logic_vector(31 downto 0);
       rx_wb_stall_i : in  std_logic;
       
-      tx_wb_stb_i   : in  std_logic;
-      tx_wb_dat_i   : in  std_logic_vector(31 downto 0);
-      tx_wb_stall_o : out std_logic);
+      -- pre-allocate buffer space used for TX
+      tx_rdy_o      : out std_logic;
+      tx_alloc_i    : in  std_logic; -- may only set '1' if rdy_o = '1'
+      
+      -- push TX data
+      tx_en_i       : in  std_logic; -- may never exceed alloc_i
+      tx_dat_i      : in  std_logic_vector(31 downto 0);
+      tx_eop_i      : in  std_logic); -- Mark last strobe
   end component;
   
   component pcie_tlp is
@@ -39,11 +44,20 @@ package pcie_wb_pkg is
       rx_wb_dat_i   : in  std_logic_vector(31 downto 0);
       rx_wb_stall_o : out std_logic;
       
+      tx_rdy_i      : out std_logic; -- NOTE(review): "_i" suffix but mode out; the "_o" ports below are mode in - confirm directions
+      tx_alloc_o    : in  std_logic; -- NOTE(review): mode in without a default value - confirm how the instantiation associates it
+      tx_en_o       : in  std_logic;
+      tx_dat_o      : in  std_logic_vector(31 downto 0);
+      tx_eop_o      : in  std_logic;
+      
       wb_stb_o      : out std_logic;
       wb_adr_o      : out std_logic_vector(63 downto 0);
       wb_we_o       : out std_logic;
       wb_dat_o      : out std_logic_vector(31 downto 0);
       wb_sel_o      : out std_logic_vector(3 downto 0);
-      wb_stall_i    : in  std_logic);
+      wb_stall_i    : in  std_logic;
+      wb_ack_i      : in  std_logic;
+      wb_err_i      : in  std_logic;
+      wb_dat_i      : in  std_logic_vector(31 downto 0));
   end component;
 end pcie_wb_pkg;
-- 
GitLab