diff --git a/hdl/rtl/Manifest.py b/hdl/rtl/Manifest.py
index 776e7344db3e55f0723ac79115f2729472552748..f66e2616207b37541604618dc491d3d513c1e8dc 100644
--- a/hdl/rtl/Manifest.py
+++ b/hdl/rtl/Manifest.py
@@ -3,6 +3,7 @@ files = [
+    "opt_rl0_pl_stg.vhd",
diff --git a/hdl/rtl/gn4124_axi_r_chl_dcfifo.vhd b/hdl/rtl/gn4124_axi_r_chl_dcfifo.vhd
index 1f9b4b6aa7e596710e8843ca5025f113aa879f15..18f9d839170bcf2cc34931cbedef1d055e34be5b 100644
--- a/hdl/rtl/gn4124_axi_r_chl_dcfifo.vhd
+++ b/hdl/rtl/gn4124_axi_r_chl_dcfifo.vhd
@@ -1,3 +1,7 @@
+-- SPDX-FileCopyrightText: 2023 CERN (home.cern)
+-- SPDX-License-Identifier: CERN-OHL-W-2.0+
 -- GN4124 core for PCIe FMC carrier
 -- http://www.ohwr.org/projects/gn4124-core
@@ -12,7 +16,7 @@
--- This source describes Open Hardware and is licensed under the CERN-OHL-W v2.
+-- This source describes Open Hardware and is licensed under the CERN-OHL-W v2 or later.
 -- You may redistribute and modify this source and make products using it
 -- under the terms of the CERN-OHL-W v2 or future versions (https://ohwr.org/cern_ohl_w_v2.txt).
@@ -22,7 +26,7 @@
 -- the CERN-OHL-W v2 for applicable conditions.
--- Copyright CERN 2021
+-- Copyright CERN 2023
 -- NOTES:
@@ -39,7 +43,8 @@ entity gn4124_axi_r_chl_dcfifo is
     g_DATA_WIDTH            : positive := 32;
     g_NUM_WORDS             : positive := 8;
     g_ALMOST_FULL_THRESHOLD : positive;
-    g_RESET_SRC_DAT         : boolean  := true
+    g_RESET_SRC_DAT         : boolean  := true;
+    g_OP_PL_STG             : boolean  := false
   port (
     -- sink interface
@@ -67,12 +72,16 @@ architecture rtl of gn4124_axi_r_chl_dcfifo is
   signal wr_write       : std_logic;
   signal rd_empty       : std_logic;
-  signal src_valid_int  : std_logic;
   signal src_dat_clk_en : std_logic;
       -- TBD: Apply attributes on this to make it a clk_en for the regs
   signal rd_dat         : std_logic_vector(g_DATA_WIDTH -1 downto 0);
   signal almost_full    : std_logic;
+  signal int_dat     : std_logic_vector(g_DATA_WIDTH -1 downto 0);
+  signal int_vld     : std_logic;
+  signal int_rdy     : std_logic;
   snk_rdy_o <= not wr_full;
@@ -123,10 +132,10 @@ begin
     if rising_edge(src_clk_i) then
       if src_dat_clk_en = '1' then
-        src_dat_o <= rd_dat;
+        int_dat <= rd_dat;
       end if;
       if g_RESET_SRC_DAT and src_rst_n_i = '0' then
-        src_dat_o <= (others => '0');
+        int_dat <= (others => '0');
       end if;
     end if;
   end process p_dat;
@@ -136,26 +145,23 @@ begin
     if rising_edge(src_clk_i) then
       if src_rst_n_i = '0' then
-        src_valid_int <= '0';
+        int_vld <= '0';
         if src_dat_clk_en = '1' then
-          src_valid_int <= '1';
-        elsif src_rdy_i = '1' and src_valid_int = '1' then
-          src_valid_int <= '0';
+          int_vld <= '1';
+        elsif int_rdy = '1' and int_vld = '1' then  -- int_vld hands off to opt_pl, so it is consumed on int_rdy (not the external src_rdy_i)
+          int_vld <= '0';
         end if;
       end if;
     end if;
   end process p_valid;
-  src_vld_o <= src_valid_int;
-  p_int_clken : process (src_valid_int, src_rdy_i, rd_empty)
+  p_int_clken : process (int_vld, int_rdy, rd_empty)
-    if src_valid_int = '1' and src_rdy_i = '1' and rd_empty = '0' then
+    if int_vld = '1' and int_rdy = '1' and rd_empty = '0' then
       src_dat_clk_en <= '1';
-    elsif src_valid_int = '0' and rd_empty = '0' then
+    elsif int_vld = '0' and rd_empty = '0' then
       src_dat_clk_en <= '1';
       src_dat_clk_en <= '0';
@@ -163,4 +169,21 @@ begin
   end process p_int_clken;
+  -- Optional pipeline stage between the FIFO read register (int_dat/int_vld/int_rdy)
+  -- and the src_* ports. opt_rl0_pl_stg is a plain pass-through unless
+  -- g_OP_PL_STG is true.
+  opt_pl : entity work.opt_rl0_pl_stg
+  generic map (
+    g_IMPLEMENT_PL_STG     => g_OP_PL_STG,
+    g_DATA_WIDTH           => g_DATA_WIDTH,
+    g_MIMIMISE_TRANSITIONS => true,            -- forwarded to the stage's flow-control block
+    g_DAT_O_HAS_SRST       => g_RESET_SRC_DAT  -- keep src_dat_o resettable when the stage registers it
+  ) port map(
+    clk_i                  => src_clk_i,
+    rst_n_i                => src_rst_n_i,
+    -- upstream side: FIFO read register
+    in_rdy_o               => int_rdy,
+    in_vld_i               => int_vld,
+    in_dat_i               => int_dat,
+    -- downstream side: source interface of this entity
+    out_rdy_i              => src_rdy_i,
+    out_vld_o              => src_vld_o,
+    out_dat_o              => src_dat_o
+  );
 end architecture rtl;
diff --git a/hdl/rtl/opt_rl0_pl_stg.vhd b/hdl/rtl/opt_rl0_pl_stg.vhd
new file mode 100644
index 0000000000000000000000000000000000000000..ea11f4b4b4edffc3eec47fe0c6d292ea0dda95df
--- /dev/null
+++ b/hdl/rtl/opt_rl0_pl_stg.vhd
@@ -0,0 +1,114 @@
+-- SPDX-FileCopyrightText: 2023 CERN (home.cern)
+-- SPDX-License-Identifier: CERN-OHL-W-2.0+
+-- GN4124 core for PCIe FMC carrier
+-- http://www.ohwr.org/projects/gn4124-core
+-- unit name:   opt_rl0_pl_stg
+-- description:  entity to generically switch in/out an AXI_ST or avalon_st RL0 pipeline stage.
+-- This source describes Open Hardware and is licensed under the CERN-OHL-W v2 or later.
+-- You may redistribute and modify this source and make products using it
+-- under the terms of the CERN-OHL-W v2 or future versions (https://ohwr.org/cern_ohl_w_v2.txt).
+-- This source is distributed WITHOUT ANY EXPRESS OR IMPLIED
+-- the CERN-OHL-W v2 for applicable conditions.
+-- Copyright CERN 2023
+library ieee;
+use ieee.std_logic_1164.all;
+-- Optional ready/valid pipeline stage (RL0 - presumably "ready latency 0"; confirm).
+-- Depending on g_IMPLEMENT_PL_STG the architecture either registers the data path
+-- under control of a separate flow-control entity, or wires the input side straight
+-- to the output side.
+entity opt_rl0_pl_stg is
+  generic (
+    -- true: implement the registered stage; false: in_* is connected directly to out_*
+    g_IMPLEMENT_PL_STG     : boolean  := false;
+    -- width in bits of in_dat_i / out_dat_o
+    g_DATA_WIDTH           : positive := 1;
+    -- only used when the stage is implemented: passed to the flow-control entity as
+    -- g_minimal_op_transitions
+    g_MIMIMISE_TRANSITIONS : boolean  := false; -- when implementing a pl_stg controls if it is minimal logic OR "cleaner" for simulation and thus debugging
+    -- only used when the stage is implemented: true adds a synchronous clear of the
+    -- data registers while rst_n_i = '0'
+    g_DAT_O_HAS_SRST       : boolean  := false
+  );
+  port (
+    clk_i                 : in  std_logic;
+    rst_n_i               : in  std_logic;  -- active-low, sampled on rising clk_i
+    -- input (upstream) side
+    in_rdy_o              : out std_logic;
+    in_vld_i              : in  std_logic;
+    in_dat_i              : in std_logic_vector(g_DATA_WIDTH-1 downto 0);
+    -- output (downstream) side
+    out_rdy_i             : in  std_logic;
+    out_vld_o             : out std_logic;
+    out_dat_o             : out std_logic_vector(g_DATA_WIDTH-1 downto 0)
+  );
+end entity opt_rl0_pl_stg;
+-- Two mutually exclusive generate branches selected by g_IMPLEMENT_PL_STG:
+--   gen_pl    : registered stage (flow-control entity + data registers)
+--   gen_no_pl : combinatorial pass-through
+architecture rtl of opt_rl0_pl_stg is
+  gen_pl : if g_IMPLEMENT_PL_STG generate
+    -- strobes produced by the flow-control entity for the data registers below
+    signal clk_en_pl_reg       : std_logic;  -- load pl_dat from in_dat_i
+    signal clk_en_op_reg       : std_logic;  -- load out_dat_o
+    signal use_dat_from_pl_reg : std_logic;  -- out_dat_o source: '1' = pl_dat, otherwise in_dat_i
+    signal pl_dat              : std_logic_vector(g_DATA_WIDTH -1 downto 0);
+  begin
+    -- Handshake control: drives in_rdy_o / out_vld_o and the three data-path strobes.
+    -- NOTE(review): this entity is not defined in the files visible in this change;
+    -- confirm it is compiled into library work (e.g. listed in a Manifest).
+    ctrl : entity work.teng_wr_nic_rl0_pl_stage_flowcontrol_srst
+    generic map (
+      g_minimal_op_transitions => g_MIMIMISE_TRANSITIONS
+    ) port map(
+      clk_i                    => clk_i,
+      rst_n_i                  => rst_n_i,
+      in_rdy_o                 => in_rdy_o,
+      in_vld_i                 => in_vld_i,
+      out_rdy_i                => out_rdy_i,
+      out_vld_o                => out_vld_o,
+      clk_en_pl_reg_o          => clk_en_pl_reg,
+      clk_en_op_reg_o          => clk_en_op_reg,
+      use_dat_from_pl_reg_o    => use_dat_from_pl_reg
+     );
+     -- Data path: a holding register (pl_dat) and the output register (out_dat_o).
+     p_pl_stage : process(clk_i) begin
+        if rising_edge(clk_i) then
+           -- capture the incoming word into the holding register
+           if '1' = clk_en_pl_reg then
+              pl_dat  <= in_dat_i;
+           end if;
+           -- load the output register from the holding register or directly from the input
+           if '1' = clk_en_op_reg then
+              if '1' = use_dat_from_pl_reg then
+                out_dat_o <= pl_dat;
+              else
+                out_dat_o <= in_dat_i;
+              end if;
+           end if;
+           -- optional synchronous clear; placed last so it overrides the loads above
+           if (g_DAT_O_HAS_SRST = true) and (rst_n_i = '0') then
+             out_dat_o <= (others => '0');
+             pl_dat    <= (others => '0');
+           end if;
+        end if;
+     end process p_pl_stage;
+  end generate;
+  -- No stage: ready, valid and data are passed through unregistered;
+  -- clk_i and rst_n_i are unused in this branch.
+  gen_no_pl : if g_IMPLEMENT_PL_STG = false generate
+  begin
+   in_rdy_o  <= out_rdy_i;
+   out_vld_o <= in_vld_i;
+   out_dat_o <= in_dat_i;
+  end generate;
+end architecture rtl;
diff --git a/hdl/rtl/p2l_axi4_rd_dc.vhd b/hdl/rtl/p2l_axi4_rd_dc.vhd
index 1826588b4cadb1822708f262ecdcb571bbcfd278..785ec49b8c9557d061a9454d6a506772dd56836d 100644
--- a/hdl/rtl/p2l_axi4_rd_dc.vhd
+++ b/hdl/rtl/p2l_axi4_rd_dc.vhd
@@ -62,7 +62,8 @@ entity p2l_axi4_rd_dc is
     g_AXI_AR_FIFO_WRDS       : positive                     := 16;    -- AXI4 AR Channel fifo depth
     g_AXI_R_FIFO_WRDS        : positive                     := 32;    -- AXI4  R Channel fifo depth
     g_EN_AXI_QOS_2_TC_LOOKUP : boolean                      := false; -- currently unused!
-    g_PCIE_DEFAULT_TC        : std_logic_vector(2 downto 0) := "000"  -- default PCIE TC value
+    g_PCIE_DEFAULT_TC        : std_logic_vector(2 downto 0) := "000"; -- default PCIE TC value
+    g_RD_AXI_GN_INIT_IP_PL   : boolean                      := true   -- put a fifo on the input to the initiator from the fifo
   port (
     -- -------------------------------------------------------
@@ -282,9 +283,10 @@ begin
     gennum_master_axi_rd_init : entity work.p2l_axi4_rd_initiator
     generic map (
-       g_USE_AR_REGISTER_PL    => false,
        g_ID_WIDTH              => g_ID_WIDTH,
        g_RD_FIFO_W             => c_R_FIFO_FILL_LVL_W,
+       g_RD_FIFO_DEPTH         => g_AXI_R_FIFO_WRDS,
     ) port map (
        clk_i                   => clk_i,
@@ -326,42 +328,35 @@ begin
---       --  block to encapsulate DCFIFO and signals to enable AXI_STREAM signalling.
---      b_r_chl_fifo : block
---      begin
         -- Pack and unpack SLV for the fifo
-      r_fifo_wdat   <= int_r_rec_id & int_r_rec.last & int_r_rec.resp & int_r_rec.data;
-      r_chnl_data_o <= r_fifo_rdat(g_DATA_WIDTH -1 downto 0);
-      r_chnl_resp_o <= r_fifo_rdat(g_DATA_WIDTH+1 downto g_DATA_WIDTH);
-      r_chnl_last_o <= r_fifo_rdat(g_DATA_WIDTH+2);
-      r_chnl_id_o   <= r_fifo_rdat(r_fifo_rdat'high downto g_DATA_WIDTH+3);
-      r_chl_dcfifo : entity work.gn4124_axi_r_chl_dcfifo
-      generic map(
-        g_DATA_WIDTH            => c_R_CHL_FIFO_W,
-        g_NUM_WORDS             => g_AXI_R_FIFO_WRDS,
-        g_RESET_SRC_DAT         => TRUE
-      ) port map (
-        snk_clk_i               => clk_i,
-        snk_rst_n_i             => rst_n_i,
-        snk_rdy_o               => int_r_rdy,
-        snk_vld_i               => int_r_vld,
-        snk_dat_i               => r_fifo_wdat,
-        snk_lvl_o               => r_chl_dcfifo_fill,
-        snk_almost_full_o       => open,    -- here as an alternative to using the level myself
-        src_clk_i               => clk_axi_i,
-        src_rst_n_i             => rst_axi_n_i,
-        src_rdy_i               => r_ready_i,
-        src_vld_o               => r_valid_o,
-        src_dat_o               => r_fifo_rdat
-      );
---      end block b_r_chl_fifo;
---    end block b_axi_rd;
+    -- R-channel word carried through the dual-clock FIFO, MSB to LSB:
+    --   id | last (1 bit) | resp (2 bits) | data (g_DATA_WIDTH bits)
+    r_fifo_wdat   <= int_r_rec_id & int_r_rec.last & int_r_rec.resp & int_r_rec.data;
+    -- unpack the FIFO read word using the same layout
+    r_chnl_data_o <= r_fifo_rdat(g_DATA_WIDTH -1 downto 0);
+    r_chnl_resp_o <= r_fifo_rdat(g_DATA_WIDTH+1 downto g_DATA_WIDTH);
+    r_chnl_last_o <= r_fifo_rdat(g_DATA_WIDTH+2);
+    r_chnl_id_o   <= r_fifo_rdat(r_fifo_rdat'high downto g_DATA_WIDTH+3);
+    -- Clock-domain crossing FIFO: written in the clk_i domain, read in the clk_axi_i domain.
+    r_chl_dcfifo : entity work.gn4124_axi_r_chl_dcfifo
+    generic map(
+      g_DATA_WIDTH            => c_R_CHL_FIFO_W,
+      g_NUM_WORDS             => g_AXI_R_FIFO_WRDS,
+      g_RESET_SRC_DAT         => TRUE
+    ) port map (
+      -- sink (write) side, clk_i domain
+      snk_clk_i               => clk_i,
+      snk_rst_n_i             => rst_n_i,
+      snk_rdy_o               => int_r_rdy,
+      snk_vld_i               => int_r_vld,
+      snk_dat_i               => r_fifo_wdat,
+      snk_lvl_o               => r_chl_dcfifo_fill,
+      snk_almost_full_o       => open,    -- here as an alternative to using the level myself
+      -- source (read) side, clk_axi_i domain
+      src_clk_i               => clk_axi_i,
+      src_rst_n_i             => rst_axi_n_i,
+      src_rdy_i               => r_ready_i,
+      src_vld_o               => r_valid_o,
+      src_dat_o               => r_fifo_rdat
+    );
   end generate gen_axi_rd_subsystem;
 end architecture struct;
diff --git a/hdl/rtl/p2l_axi4_rd_initiator.vhd b/hdl/rtl/p2l_axi4_rd_initiator.vhd
index fd457fdf7a7936a1acaa601a73451a1b01c56bee..ddfae20cfe188a812ad0b50cfbd297c88b00fb20 100644
--- a/hdl/rtl/p2l_axi4_rd_initiator.vhd
+++ b/hdl/rtl/p2l_axi4_rd_initiator.vhd
@@ -44,6 +44,7 @@ entity p2l_axi4_rd_initiator is
     g_USE_AR_REGISTER_PL    : boolean  := false;
     g_ID_WIDTH              : positive := 2;
     g_RD_FIFO_W             : natural;
+    g_RD_FIFO_DEPTH         : positive;
     g_RD_FIFO_SPACE_CMP_VAL : positive
   port (
@@ -75,11 +76,11 @@ entity p2l_axi4_rd_initiator is
     -- currently not used..
     tid_gnt_i               : in  std_logic;
     tid_req_o               : out std_logic;
-    tid_val_i               : in  std_logic_Vector(1 downto 0); -- := g_TID_CID;
+    tid_val_i               : in  std_logic_vector(1 downto 0); -- := g_TID_CID;
     cid_val_o               : out std_logic_vector(1 downto 0);
     cid_fin_o               : out std_logic;
-    r_chl_dcfifo_fill_lvl_i : in  std_logic_Vector(g_RD_FIFO_W-1 downto 0);
+    r_chl_dcfifo_fill_lvl_i : in  std_logic_vector(g_RD_FIFO_W-1 downto 0);
     -- TODO: should make this generic based on the fifo size, or "merge" the fifo into this module...
     -- data pipe to push data from this Axi Read Master to the arbitrator..
@@ -147,7 +148,7 @@ architecture rtl of p2l_axi4_rd_initiator is
   signal cmd_length_in_words  : unsigned(9 downto 0);
-  signal outstanding_data     : unsigned ( 8 downto 0);
+  signal outstanding_data     : unsigned (8 downto 0);
   -- TBD to we add a outstanding_data_is_0 signal to help timing?
   signal rdata_fifo_has_space : std_logic;
@@ -172,6 +173,18 @@ architecture rtl of p2l_axi4_rd_initiator is
   signal sm_err_decode  : t_err;
   signal int_seen_error : std_logic;
+  signal fifo_space_bodge_count : unsigned(3 downto 0);
+  signal potential_op_fifo_depth : unsigned(g_RD_FIFO_W downto 0);
+  signal op_fifo_has_space : std_logic;
+--synthesis translate_off
+  signal sm_active_cycles          : natural;
+  signal sm_idle_backed_off_cycles : natural;
+--synthesis translate_on
@@ -270,7 +283,8 @@ begin
       case cmd_state is
         when ST_IDLE =>
-          if ar_sm_vld = '1' and ar_sm_rdy = '0' and rd_fifo_empty_duration_ok = '1' then
+          --if ar_sm_vld = '1' and ar_sm_rdy = '0' and rd_fifo_empty_duration_ok = '1' then  -- PC 1 MAY 2023 perf incr??
+          if ar_sm_vld = '1' and ar_sm_rdy = '0' and op_fifo_has_space = '1' then
             if ar_chl_sm.err = '1' then
               cmd_state     <= ST_ERR;
               err_push_last <= ar_chl_sm.r_last;
@@ -416,10 +430,61 @@ begin
   end process p_fsm;
+--synthesis translate_off
+  -- Simulation-only performance counters (the enclosing translate_off/on pragmas
+  -- keep them out of synthesis).
+  --   sm_active_cycles          : cycles with the FSM out of ST_IDLE, or in ST_IDLE
+  --                               with a command pending (ar_sm_vld = '1', ar_sm_rdy = '0')
+  --   sm_idle_backed_off_cycles : pending-in-ST_IDLE cycles where op_fifo_has_space = '1'
+  --                               while rd_fifo_empty_duration_ok = '0'
+  p_simonly_perf : process (clk_i)
+  begin
+    if rising_edge(clk_i) then
+      if rst_n_i = '0' then
+        -- synchronous reset clears both counters
+        sm_active_cycles          <= 0;
+        sm_idle_backed_off_cycles <= 0;
+      else
+        if cmd_state /= ST_IDLE or (ar_sm_vld = '1' and ar_sm_rdy = '0') then
+          sm_active_cycles <= sm_active_cycles + 1;
+        end if;
+        if cmd_state = ST_IDLE and ar_sm_vld = '1' and ar_sm_rdy = '0'
+           and rd_fifo_empty_duration_ok = '0' and op_fifo_has_space = '1' then
+          sm_idle_backed_off_cycles <= sm_idle_backed_off_cycles + 1;
+        end if;
+      end if;
+    end if;
+  end process p_simonly_perf;
+--synthesis translate_on
+  -- Registered estimate of whether the read-data FIFO can accept the next burst.
+  --   stage 1: potential_op_fifo_depth = fill level + fifo_space_bodge_count + ar_chl_sm.len + 3
+  --   stage 2: op_fifo_has_space = '1' when that sum is below g_RD_FIFO_DEPTH
+  -- op_fifo_has_space therefore lags the inputs by two clk_i cycles.
+  -- fifo_space_bodge_count is held at its maximum while the FSM is in ST_IDLE and
+  -- counts down to zero once it leaves, so it acts as an extra margin on the estimate.
+  p_fifo_space : process (clk_i)
+  begin
+    if rising_edge(clk_i) then
+      if cmd_state = ST_IDLE then
+        fifo_space_bodge_count <= (others => '1');
+      elsif fifo_space_bodge_count /= "0000" then
+        fifo_space_bodge_count <= fifo_space_bodge_count - 1;
+      end if;
+      -- numeric_std "+" returns the width of its wider operand, so the first operand
+      -- is widened to the destination width up front; otherwise the running sum is
+      -- truncated to g_RD_FIFO_W bits and a nearly full FIFO can wrap and look empty.
+      -- NOTE(review): the sum can still exceed g_RD_FIFO_W+1 bits for large
+      -- ar_chl_sm.len; confirm the range of len against the FIFO depth.
+      potential_op_fifo_depth <= resize(unsigned(r_chl_dcfifo_fill_lvl_i), potential_op_fifo_depth'length)
+                               + fifo_space_bodge_count
+                               + ar_chl_sm.len
+                               + to_unsigned(3, potential_op_fifo_depth'length);
+      if potential_op_fifo_depth < to_unsigned(g_RD_FIFO_DEPTH, potential_op_fifo_depth'length) then
+        op_fifo_has_space <= '1';
+      else
+        op_fifo_has_space <= '0';
+      end if;
+      -- synchronous reset, placed last so it overrides the counter update above;
+      -- only the margin counter is reset
+      if rst_n_i = '0' then
+        fifo_space_bodge_count <= (others => '0');
+      end if;
+    end if;
+  end process p_fifo_space;
   -- NOTE: we need to be really carefull here the value g_RD_FIFO_SPACE_CMP_VAL used in the
   --       comparison. it is a bit of a "magic" number...
   --       Ideally we would use an axi style FIFO that had "ready" (full almost full etc)
-  --       synchronous to the value of fil level, but that is not to the authors knowledge availablein the general-cores lib.
+  --       synchronous to the value of fill level, but to the author's knowledge that is
+  --       not available in the general-cores lib.
   p_ready : process (clk_i)
     if rising_edge(clk_i) then
@@ -450,7 +515,7 @@ begin
       if rst_n_i = '0' then
         rd_fifo_empty_cycle_count <= (others => '1');
-                       -- 7 cycles of being empty before we can get ready to issue the next command
+                     -- 7 cycles of being empty before we can get ready to issue the next command
         rd_fifo_empty_duration_ok <= '0';
       end if;
     end if;