diff --git a/hdl/rtl/p2l_dma_master.vhd b/hdl/rtl/p2l_dma_master.vhd
index 9841d9f1545f2407cad72f5a8aa6d362e3d4253d..29a616da6419ae49019102e5c0bb7289006662d2 100644
--- a/hdl/rtl/p2l_dma_master.vhd
+++ b/hdl/rtl/p2l_dma_master.vhd
@@ -15,10 +15,14 @@
 -- waiting for the WB side to write all data. This can be a problem, especially
 -- when the WB side is slower and the next DMA transaction is a read which will
 -- switch the WB signals to the L2P module, thus cutting the previous WB write
--- transfer from this module in the middle.
+-- transfer from this module in the middle. ***** HOPEFULLY THIS IS NOW FIXED!!!
+--
+-- The fix will hurt performance, Ideally we would seperately signal PCIE_COMPLETE
+-- and WB_COMPLETE in that way the controller can mask this impact in some sequences
+-- of transactions.
 --
 --------------------------------------------------------------------------------
--- Copyright CERN 2010-2020
+-- Copyright CERN 2010-2021
 --------------------------------------------------------------------------------
 -- Copyright and related rights are licensed under the Solderpad Hardware
 -- License, Version 2.0 (the "License"); you may not use this file except
@@ -42,8 +46,10 @@ use work.genram_pkg.all;
 
 entity p2l_dma_master is
   generic (
-    g_FIFO_SIZE : positive := 64;
-    g_BYTE_SWAP : boolean  := FALSE);
+    g_FIFO_SIZE : positive                     := 64;
+    g_BYTE_SWAP : boolean                      := FALSE;
+    g_TID_CID   : std_logic_vector(1 downto 0) := "01"
+  );
   port (
       ---------------------------------------------------------
       -- GN4124 core clock and reset
@@ -127,6 +133,8 @@ architecture arch of p2l_dma_master is
   -- c_MAX_READ_REQ_SIZE is the maximum size (in 32-bit words) of the payload of a packet.
   -- Allowed c_MAX_READ_REQ_SIZE values are: 32, 64, 128, 256, 512, 1024.
   -- This constant must be set according to the GN4124 and motherboard chipset capabilities.
+  --
+  -- Or should the driver read the value configured in the gennum, and sets it in the DMA_CSR
   constant c_MAX_READ_REQ_SIZE     : unsigned(10 downto 0) := to_unsigned(1024, 11);
 
   -----------------------------------------------------------------------------
@@ -162,9 +170,9 @@ architecture arch of p2l_dma_master is
   signal to_wb_fifo_rd        : std_logic;
   signal to_wb_fifo_wr        : std_logic;
   signal to_wb_fifo_wr_d      : std_logic;
-  signal to_wb_fifo_din       : std_logic_vector(63 downto 0) := (others => '0');
-  signal to_wb_fifo_din_d     : std_logic_vector(63 downto 0) := (others => '0');
-  signal to_wb_fifo_dout      : std_logic_vector(63 downto 0);
+  signal to_wb_fifo_din       : std_logic_vector(62 downto 0) := (others => '0');
+  signal to_wb_fifo_din_d     : std_logic_vector(62 downto 0) := (others => '0');
+  signal to_wb_fifo_dout      : std_logic_vector(62 downto 0);
   signal to_wb_fifo_byte_swap : std_logic_vector(1 downto 0) := (others => '0');
 
   -- wishbone
@@ -173,7 +181,8 @@ architecture arch of p2l_dma_master is
   signal wb_dma_tfr     : boolean;
 
   -- P2L DMA read request FSM
-  type p2l_dma_state_type is (P2L_IDLE, P2L_SETUP, P2L_HEADER, P2L_ADDR_H, P2L_ADDR_L, P2L_WAIT_READ_COMPLETION);
+  type p2l_dma_state_type is (P2L_IDLE, P2L_SETUP, P2L_HEADER, P2L_ADDR_H, P2L_ADDR_L,
+                              P2L_WAIT_READ_COMPLETION, P2L_WAIT_WISHBONE_COMPLETE);
   signal p2l_dma_current_state : p2l_dma_state_type;
   signal p2l_data_cnt          : unsigned(10 downto 0) := (others => '0');
 
@@ -188,6 +197,12 @@ architecture arch of p2l_dma_master is
   signal to_wb_fifo_full_d    : std_logic_vector(c_SYNC_FIFO_FULL_DELAY - 1 downto 0) := (others => '0');
   signal to_wb_fifo_full_next : std_logic;
 
+
+  signal fsm_wb_reading_complete : std_logic;
+  signal wb_sent_all_data        : std_logic := '0';
+  signal wb_wdat_is_last         : std_logic;
+
+
 begin
   -- Errors to DMA controller
   dma_ctrl_error_o <= dma_busy_error or completion_error;
@@ -200,7 +215,7 @@ begin
   p_read_req_fsm : process (clk_i)
   begin
     if rising_edge(clk_i) then
-      if rst_n_i = '0' then
+      if rst_n_i = '0' then -- some registers are using rst_n_i as a clk_en
         p2l_dma_current_state <= P2L_IDLE;
         pdm_arb_req_o         <= '0';
         pdm_arb_valid_o       <= '0';
@@ -247,6 +262,7 @@ begin
             else
               l2p_len_header  <= l2p_len_cnt(10 downto 0);
             end if;
+
             -- request access to PCIe bus
             pdm_arb_req_o         <= '1';
             p2l_dma_current_state <= P2L_HEADER;
@@ -262,14 +278,14 @@ begin
               pdm_arb_data(27 downto 24) <= "000" & l2p_64b_address; -->  Packet type = read req (32 or 64)
               if l2p_len_header = 1 then
                 -- Last Byte Enable must be "0000" when length = 1
-                pdm_arb_data(23 downto 20) <= "0000"; -->  LBE (Last Byte Enable)
+                pdm_arb_data(23 downto 20) <= "0000";  -->  LBE (Last Byte Enable)
               else
-                pdm_arb_data(23 downto 20) <= "1111"; -->  LBE (Last Byte Enable)
+                pdm_arb_data(23 downto 20) <= "1111";  -->  LBE (Last Byte Enable)
               end if;
-              pdm_arb_data(19 downto 16) <= "1111"; -->  FBE (First Byte Enable)
-              pdm_arb_data(15 downto 13) <= "111";  -->  Reserved
-              pdm_arb_data(12)           <= '0';    -->  VC (Virtual Channel)
-              pdm_arb_data(11 downto 10) <= "01";   -->  CID
+              pdm_arb_data(19 downto 16) <= "1111";    -->  FBE (First Byte Enable)
+              pdm_arb_data(15 downto 13) <= "111";     -->  Reserved
+              pdm_arb_data(12)           <= '0';       -->  VC (Virtual Channel)
+              pdm_arb_data(11 downto 10) <= g_TID_CID; -->  CID
               pdm_arb_data(9  downto 0)  <= std_logic_vector(l2p_len_header (9 downto 0));  -->  Length (in words)
               pdm_arb_valid_o  <= '1';
               pdm_arb_dframe_o <= '1';
@@ -297,12 +313,14 @@ begin
             l2p_len_cnt <= l2p_len_cnt - l2p_len_header;
             l2p_address_l <= std_logic_vector(unsigned(l2p_address_l) + 4*l2p_len_header);
             p2l_dma_current_state <= P2L_WAIT_READ_COMPLETION;
-            
+
           when P2L_WAIT_READ_COMPLETION =>
             -- End of the read request packet
             pdm_arb_valid_o <= '0';
-            
-            if pd_pdm_data_valid_i = '1' and pd_pdm_master_cpld_i = '1' then
+
+            if      pd_pdm_data_valid_i = '1'
+                and pd_pdm_master_cpld_i = '1'
+                and pd_pdm_hdr_cid_i = g_TID_CID then
               --  Received a word.
               l2p_len_header <= l2p_len_header - 1;
             end if;
@@ -310,8 +328,10 @@ begin
             if dma_ctrl_abort_i = '1' then
               rx_error_t            <= '1';
               p2l_dma_current_state <= P2L_IDLE;
-            elsif pd_pdm_master_cpld_i = '1' and pd_pdm_data_last_i = '1'
+            elsif     pd_pdm_master_cpld_i = '1'
+                  and pd_pdm_data_last_i = '1'
                   and l2p_len_header = 1
+                  and pd_pdm_hdr_cid_i = g_TID_CID
             then
               -- Note: a read request may result in multiple read completion.
               -- last word of read completion has been received
@@ -321,18 +341,30 @@ begin
               else
                 -- indicate end of DMA transfer
                 if is_next_item = '1' then
-                  next_item_valid_o <= '1';
+                  next_item_valid_o     <= '1';
+                  p2l_dma_current_state <= P2L_IDLE;
                 else
-                  dma_ctrl_done_t <= '1';
+                  p2l_dma_current_state <= P2L_WAIT_WISHBONE_COMPLETE;
+                  --dma_ctrl_done_t <= '1';
                 end if;
-                p2l_dma_current_state <= P2L_IDLE;
+
               end if;
-            elsif pd_pdm_master_cpln_i = '1' then
+            elsif    pd_pdm_master_cpln_i = '1'
+                 and pd_pdm_hdr_cid_i = g_TID_CID
+            then
               -- should not return a read completion without data
               completion_error      <= '1';
-              p2l_dma_current_state <= P2L_IDLE;
+              p2l_dma_current_state <= P2L_IDLE;   -- TBD error handling, and all wishbone_dat_done..
             end if;
 
+          when P2L_WAIT_WISHBONE_COMPLETE =>
+             if dma_ctrl_abort_i = '1' then
+                p2l_dma_current_state <= P2L_IDLE;
+             elsif fsm_wb_reading_complete = '1' then
+                dma_ctrl_done_t       <= '1';
+                p2l_dma_current_state <= P2L_IDLE;
+             end if;
+
           when others =>
             p2l_dma_current_state <= P2L_IDLE;
             pdm_arb_req_o         <= '0';
@@ -372,9 +404,10 @@ begin
       if p2l_dma_current_state = P2L_ADDR_L then
           -- Store number of 32-bit data words to be received for the current read request
         p2l_data_cnt <= l2p_len_header;
-      elsif p2l_dma_current_state = P2L_WAIT_READ_COMPLETION
-            and pd_pdm_data_valid_i = '1'
+      elsif p2l_dma_current_state    = P2L_WAIT_READ_COMPLETION
+            and pd_pdm_data_valid_i  = '1'
             and pd_pdm_master_cpld_i = '1'
+            and pd_pdm_hdr_cid_i = g_TID_CID
       then
           -- decrement number of data to be received
         p2l_data_cnt <= p2l_data_cnt - 1;
@@ -388,8 +421,10 @@ begin
   p_next_item : process (clk_i)
   begin
     if rising_edge(clk_i) then
-      if p2l_dma_current_state = P2L_WAIT_READ_COMPLETION
-         and is_next_item = '1' and pd_pdm_data_valid_i = '1'
+      if     p2l_dma_current_state = P2L_WAIT_READ_COMPLETION
+         and is_next_item = '1'
+         and pd_pdm_data_valid_i = '1'
+         and pd_pdm_hdr_cid_i = g_TID_CID
       then
         -- next item data are supposed to be received in the right order !!
         case p2l_data_cnt(2 downto 0) is
@@ -400,7 +435,7 @@ begin
           when "101" =>
             next_item_host_addr_h <= pd_pdm_data_i;
           when "100" =>
-            next_item_len <= pd_pdm_data_i;
+            next_item_len    <= pd_pdm_data_i;
           when "011" =>
             next_item_next_l <= pd_pdm_data_i;
           when "010" =>
@@ -428,7 +463,7 @@ begin
   p_addr_cnt : process (clk_i)
   begin
     if rising_edge(clk_i) then
-      if rst_n_i = '0' then
+      if rst_n_i = '0' then -- there are definitely registers using rst_n_i as a clk_en in this process
         dma_busy_error       <= '0';
         to_wb_fifo_wr        <= '0';
         to_wb_fifo_wr_d      <= '0';
@@ -436,6 +471,12 @@ begin
         to_wb_fifo_din_d <= to_wb_fifo_din;
         to_wb_fifo_wr_d  <= to_wb_fifo_wr;
 
+        if p2l_dma_current_state /= P2L_IDLE and dma_ctrl_start_p2l_i = '1'  then
+           dma_busy_error <= '1';
+        else
+           dma_busy_error <= '0';
+        end if;
+
         if dma_ctrl_start_p2l_i = '1' then
           if p2l_dma_current_state = P2L_IDLE then
             -- dma_ctrl_target_addr_i is a byte address and target_addr_cnt is a
@@ -443,11 +484,11 @@ begin
             target_addr_cnt      <= unsigned(dma_ctrl_carrier_addr_i(31 downto 2));
             -- stores byte swap info for the current DMA transfer
             to_wb_fifo_byte_swap <= dma_ctrl_byte_swap_i;
-          else
-            dma_busy_error <= '1';
           end if;
-        elsif p2l_dma_current_state = P2L_WAIT_READ_COMPLETION
-              and is_next_item = '0' and pd_pdm_data_valid_i = '1'
+        elsif     p2l_dma_current_state = P2L_WAIT_READ_COMPLETION
+              and is_next_item = '0'
+              and pd_pdm_data_valid_i = '1'
+              and pd_pdm_hdr_cid_i = g_TID_CID
         then
           -- write target address and data to the sync fifo
           to_wb_fifo_wr                <= '1';
@@ -455,8 +496,17 @@ begin
           to_wb_fifo_din(61 downto 32) <= std_logic_vector(target_addr_cnt);
           -- increment target address counter
           target_addr_cnt              <= target_addr_cnt + 1;
+
+          -- indicate that this is the last "beat" of the transfer so we can control the switch when the wishbone interface is done!
+          if l2p_len_cnt = 0 and pd_pdm_data_last_i = '1' and l2p_len_header = 1 then
+            to_wb_fifo_din(62) <= '1';
+          else
+            to_wb_fifo_din(62) <= '0';
+          end if;
+
+          -- NOTE we dont do anything for error conditions, Assumption DMA controller stops..
+
         else
-          dma_busy_error <= '0';
           to_wb_fifo_wr  <= '0';
         end if;
       end if;
@@ -497,32 +547,37 @@ begin
 
   to_wb_fifo_full <= to_wb_fifo_full_d(to_wb_fifo_full_d'high);
 
+  -- TBD: should we monitor the fill level: e.g. dont issue a write until there is space in the
+  --  receiving fifo to avoid backpressure on the bus
   cmp_to_wb_fifo : generic_async_fifo_dual_rst
     generic map (
-      g_DATA_WIDTH            => 64,
+      g_DATA_WIDTH            => 63,
       g_SIZE                  => g_FIFO_SIZE,
       g_SHOW_AHEAD            => TRUE,
       g_WITH_WR_FULL          => FALSE,
       g_WITH_WR_ALMOST_FULL   => TRUE,
       -- 20 less to give time to the GN4124 to react to P2L_RDY going low.
-      g_ALMOST_FULL_THRESHOLD => g_FIFO_SIZE - c_SYNC_FIFO_FULL_DELAY - 20)
-    port map (
+      g_ALMOST_FULL_THRESHOLD => g_FIFO_SIZE - c_SYNC_FIFO_FULL_DELAY - 20
+    ) port map (
       -- write port
       rst_wr_n_i       => fifo_rst_n,
       clk_wr_i         => clk_i,
       d_i              => to_wb_fifo_din_d,
       we_i             => to_wb_fifo_wr_d,
       wr_almost_full_o => to_wb_fifo_full_next,
+      wr_count_o       => open,
       -- read port
       rst_rd_n_i       => wb_fifo_rst_n,
       clk_rd_i         => wb_dma_clk_i,
       q_o              => to_wb_fifo_dout,
       rd_i             => to_wb_fifo_rd,
-      rd_empty_o       => to_wb_fifo_empty);
+      rd_empty_o       => to_wb_fifo_empty
+    );
 
   -- pause transfer from GN4124 if fifo is (almost) full
   p2l_rdy_o <= not(to_wb_fifo_full);
 
+
   ------------------------------------------------------------------------------
   -- Wishbone master (write only)
   ------------------------------------------------------------------------------
@@ -545,43 +600,74 @@ begin
   p_wb_master : process (wb_dma_clk_i)
   begin
     if rising_edge(wb_dma_clk_i) then
-      if wb_fifo_rst_n = '0' then
-        wb_dma_o.cyc <= '0';
-        wb_dma_out_stb <= '0';
-        wb_ack_cnt <= (others => '0');
-      else
-        if to_wb_fifo_rd = '1' then
-          --  Data available, read them from the fifo.
-          wb_dma_o.adr(31 downto 30) <= "00";
-          wb_dma_o.adr(29 downto 0)  <= to_wb_fifo_dout(61 downto 32);
-          wb_dma_o.dat               <= to_wb_fifo_dout(31 downto 0);
-
-          -- Data/addresses are valid when fifo was just read.
-          wb_dma_out_stb <= '1';
-          wb_dma_o.cyc <= '1';
-        else
-          --  No read.
-          if wb_dma_out_stb = '1' and wb_dma_i.stall = '1' then
-            --  Data were not read, just wait.
-            null;
-          elsif to_wb_fifo_empty = '1' then
-            --  No more data to produce.
-            wb_dma_out_stb <= '0';
+      wb_sent_all_data <= '0';
+      if to_wb_fifo_rd = '1' then
+        --  Data available, read them from the fifo.
+        wb_dma_o.adr(31 downto 30) <= "00";
+        wb_dma_o.adr(29 downto 0)  <= to_wb_fifo_dout(61 downto 32);
+        wb_dma_o.dat               <= to_wb_fifo_dout(31 downto 0);
+
+
+        -- use to_wb_fifo_dout(62) to flag the last trasnfer!!!  track it and send it back to the SM to indicate done!
+        wb_wdat_is_last <= to_wb_fifo_dout(62);
 
-            if wb_ack_cnt = 0 then
-              --  End of the burst
-              wb_dma_o.cyc <= '0';
+        -- Data/addresses are valid when fifo was just read.
+        wb_dma_out_stb <= '1';
+        wb_dma_o.cyc   <= '1';
+      else
+        --  No read.
+        if wb_dma_out_stb = '1' and wb_dma_i.stall = '1' then
+          --  Data were not read, just wait.
+          null;
+        elsif to_wb_fifo_empty = '1' then
+          --  No more data to produce.
+          wb_dma_out_stb <= '0';
+
+          if wb_ack_cnt = 0 then
+            --  End of the burst
+            wb_dma_o.cyc <= '0';
+
+            if wb_wdat_is_last = '1' then
+              wb_sent_all_data <= '1';
+              wb_wdat_is_last  <= '0';
             end if;
+
           end if;
         end if;
+      end if;
 
-        --  Track number of expected ack.
-        if wb_dma_tfr and wb_dma_i.ack = '0' then
-          wb_ack_cnt <= wb_ack_cnt + 1;
-        elsif not wb_dma_tfr and wb_dma_i.ack = '1' then
-          wb_ack_cnt <= wb_ack_cnt - 1;
-        end if;
+      --  Track number of expected ack.
+      if wb_dma_tfr and wb_dma_i.ack = '0' then
+        wb_ack_cnt <= wb_ack_cnt + 1;
+      elsif not wb_dma_tfr and wb_dma_i.ack = '1' then
+        wb_ack_cnt <= wb_ack_cnt - 1;
+      end if;
+
+      -- we dont want to sync reset all signals and not wanting to infer a clk_en we do it like this.
+      if wb_fifo_rst_n = '0' then
+        wb_dma_o.cyc   <= '0';
+        wb_dma_out_stb <= '0';
+        wb_ack_cnt     <= (others => '0');
       end if;
+
     end if;
   end process p_wb_master;
+
+
+   -- TBD: should we use the flowcontrol aspects of this to backpressure WB?
+   -- That would allow a counter of outstanding "transactions" to count off so
+   -- we know they are all completed:
+   --  e.g. separate pcie_done_o and wb_done_o signals to the controller.
+  pulse_sync_wb_done : entity work.gc_pulse_synchronizer2
+    port map (
+      clk_in_i    => wb_dma_clk_i,
+      rst_in_n_i  => wb_fifo_rst_n,
+      clk_out_i   => clk_i,
+      rst_out_n_i => fifo_rst_n,
+      d_ready_o   => open,
+      d_ack_p_o   => open,
+      d_p_i       => wb_sent_all_data,
+      q_p_o       => fsm_wb_reading_complete
+  );
+
 end architecture arch;