diff --git a/hdl/rtl/p2l_dma_master.vhd b/hdl/rtl/p2l_dma_master.vhd index 9841d9f1545f2407cad72f5a8aa6d362e3d4253d..29a616da6419ae49019102e5c0bb7289006662d2 100644 --- a/hdl/rtl/p2l_dma_master.vhd +++ b/hdl/rtl/p2l_dma_master.vhd @@ -15,10 +15,14 @@ -- waiting for the WB side to write all data. This can be a problem, especially -- when the WB side is slower and the next DMA transaction is a read which will -- switch the WB signals to the L2P module, thus cutting the previous WB write --- transfer from this module in the middle. +-- transfer from this module in the middle. ***** HOPEFULLY THIS IS NOW FIXED!!! +-- +-- The fix will hurt performance. Ideally we would separately signal PCIE_COMPLETE +-- and WB_COMPLETE so that the controller can mask this impact in some sequences +-- of transactions. -- -------------------------------------------------------------------------------- --- Copyright CERN 2010-2020 +-- Copyright CERN 2010-2021 -------------------------------------------------------------------------------- -- Copyright and related rights are licensed under the Solderpad Hardware -- License, Version 2.0 (the "License"); you may not use this file except @@ -42,8 +46,10 @@ use work.genram_pkg.all; entity p2l_dma_master is generic ( - g_FIFO_SIZE : positive := 64; - g_BYTE_SWAP : boolean := FALSE); + g_FIFO_SIZE : positive := 64; + g_BYTE_SWAP : boolean := FALSE; + g_TID_CID : std_logic_vector(1 downto 0) := "01" + ); port ( --------------------------------------------------------- -- GN4124 core clock and reset @@ -127,6 +133,8 @@ architecture arch of p2l_dma_master is -- c_MAX_READ_REQ_SIZE is the maximum size (in 32-bit words) of the payload of a packet. -- Allowed c_MAX_READ_REQ_SIZE values are: 32, 64, 128, 256, 512, 1024. -- This constant must be set according to the GN4124 and motherboard chipset capabilities. 
+ -- + -- Or should the driver read the value configured in the gennum, and sets it in the DMA_CSR constant c_MAX_READ_REQ_SIZE : unsigned(10 downto 0) := to_unsigned(1024, 11); ----------------------------------------------------------------------------- @@ -162,9 +170,9 @@ architecture arch of p2l_dma_master is signal to_wb_fifo_rd : std_logic; signal to_wb_fifo_wr : std_logic; signal to_wb_fifo_wr_d : std_logic; - signal to_wb_fifo_din : std_logic_vector(63 downto 0) := (others => '0'); - signal to_wb_fifo_din_d : std_logic_vector(63 downto 0) := (others => '0'); - signal to_wb_fifo_dout : std_logic_vector(63 downto 0); + signal to_wb_fifo_din : std_logic_vector(62 downto 0) := (others => '0'); + signal to_wb_fifo_din_d : std_logic_vector(62 downto 0) := (others => '0'); + signal to_wb_fifo_dout : std_logic_vector(62 downto 0); signal to_wb_fifo_byte_swap : std_logic_vector(1 downto 0) := (others => '0'); -- wishbone @@ -173,7 +181,8 @@ architecture arch of p2l_dma_master is signal wb_dma_tfr : boolean; -- P2L DMA read request FSM - type p2l_dma_state_type is (P2L_IDLE, P2L_SETUP, P2L_HEADER, P2L_ADDR_H, P2L_ADDR_L, P2L_WAIT_READ_COMPLETION); + type p2l_dma_state_type is (P2L_IDLE, P2L_SETUP, P2L_HEADER, P2L_ADDR_H, P2L_ADDR_L, + P2L_WAIT_READ_COMPLETION, P2L_WAIT_WISHBONE_COMPLETE); signal p2l_dma_current_state : p2l_dma_state_type; signal p2l_data_cnt : unsigned(10 downto 0) := (others => '0'); @@ -188,6 +197,12 @@ architecture arch of p2l_dma_master is signal to_wb_fifo_full_d : std_logic_vector(c_SYNC_FIFO_FULL_DELAY - 1 downto 0) := (others => '0'); signal to_wb_fifo_full_next : std_logic; + + signal fsm_wb_reading_complete : std_logic; + signal wb_sent_all_data : std_logic := '0'; + signal wb_wdat_is_last : std_logic; + + begin -- Errors to DMA controller dma_ctrl_error_o <= dma_busy_error or completion_error; @@ -200,7 +215,7 @@ begin p_read_req_fsm : process (clk_i) begin if rising_edge(clk_i) then - if rst_n_i = '0' then + if rst_n_i = '0' then -- some 
registers are using rst_n_i as a clk_en p2l_dma_current_state <= P2L_IDLE; pdm_arb_req_o <= '0'; pdm_arb_valid_o <= '0'; @@ -247,6 +262,7 @@ begin else l2p_len_header <= l2p_len_cnt(10 downto 0); end if; + -- request access to PCIe bus pdm_arb_req_o <= '1'; p2l_dma_current_state <= P2L_HEADER; @@ -262,14 +278,14 @@ begin pdm_arb_data(27 downto 24) <= "000" & l2p_64b_address; --> Packet type = read req (32 or 64) if l2p_len_header = 1 then -- Last Byte Enable must be "0000" when length = 1 - pdm_arb_data(23 downto 20) <= "0000"; --> LBE (Last Byte Enable) + pdm_arb_data(23 downto 20) <= "0000"; --> LBE (Last Byte Enable) else - pdm_arb_data(23 downto 20) <= "1111"; --> LBE (Last Byte Enable) + pdm_arb_data(23 downto 20) <= "1111"; --> LBE (Last Byte Enable) end if; - pdm_arb_data(19 downto 16) <= "1111"; --> FBE (First Byte Enable) - pdm_arb_data(15 downto 13) <= "111"; --> Reserved - pdm_arb_data(12) <= '0'; --> VC (Virtual Channel) - pdm_arb_data(11 downto 10) <= "01"; --> CID + pdm_arb_data(19 downto 16) <= "1111"; --> FBE (First Byte Enable) + pdm_arb_data(15 downto 13) <= "111"; --> Reserved + pdm_arb_data(12) <= '0'; --> VC (Virtual Channel) + pdm_arb_data(11 downto 10) <= g_TID_CID; --> CID pdm_arb_data(9 downto 0) <= std_logic_vector(l2p_len_header (9 downto 0)); --> Length (in words) pdm_arb_valid_o <= '1'; pdm_arb_dframe_o <= '1'; @@ -297,12 +313,14 @@ begin l2p_len_cnt <= l2p_len_cnt - l2p_len_header; l2p_address_l <= std_logic_vector(unsigned(l2p_address_l) + 4*l2p_len_header); p2l_dma_current_state <= P2L_WAIT_READ_COMPLETION; - + when P2L_WAIT_READ_COMPLETION => -- End of the read request packet pdm_arb_valid_o <= '0'; - - if pd_pdm_data_valid_i = '1' and pd_pdm_master_cpld_i = '1' then + + if pd_pdm_data_valid_i = '1' + and pd_pdm_master_cpld_i = '1' + and pd_pdm_hdr_cid_i = g_TID_CID then -- Received a word. 
l2p_len_header <= l2p_len_header - 1; end if; @@ -310,8 +328,10 @@ begin if dma_ctrl_abort_i = '1' then rx_error_t <= '1'; p2l_dma_current_state <= P2L_IDLE; - elsif pd_pdm_master_cpld_i = '1' and pd_pdm_data_last_i = '1' + elsif pd_pdm_master_cpld_i = '1' + and pd_pdm_data_last_i = '1' and l2p_len_header = 1 + and pd_pdm_hdr_cid_i = g_TID_CID then -- Note: a read request may result in multiple read completion. -- last word of read completion has been received @@ -321,18 +341,30 @@ begin else -- indicate end of DMA transfer if is_next_item = '1' then - next_item_valid_o <= '1'; + next_item_valid_o <= '1'; + p2l_dma_current_state <= P2L_IDLE; else - dma_ctrl_done_t <= '1'; + p2l_dma_current_state <= P2L_WAIT_WISHBONE_COMPLETE; + --dma_ctrl_done_t <= '1'; end if; - p2l_dma_current_state <= P2L_IDLE; + end if; - elsif pd_pdm_master_cpln_i = '1' then + elsif pd_pdm_master_cpln_i = '1' + and pd_pdm_hdr_cid_i = g_TID_CID + then -- should not return a read completion without data completion_error <= '1'; - p2l_dma_current_state <= P2L_IDLE; + p2l_dma_current_state <= P2L_IDLE; -- TBD error handling, and all wishbone_dat_done.. 
end if; + when P2L_WAIT_WISHBONE_COMPLETE => + if dma_ctrl_abort_i = '1' then + p2l_dma_current_state <= P2L_IDLE; + elsif fsm_wb_reading_complete = '1' then + dma_ctrl_done_t <= '1'; + p2l_dma_current_state <= P2L_IDLE; + end if; + when others => p2l_dma_current_state <= P2L_IDLE; pdm_arb_req_o <= '0'; @@ -372,9 +404,10 @@ begin if p2l_dma_current_state = P2L_ADDR_L then -- Store number of 32-bit data words to be received for the current read request p2l_data_cnt <= l2p_len_header; - elsif p2l_dma_current_state = P2L_WAIT_READ_COMPLETION - and pd_pdm_data_valid_i = '1' + elsif p2l_dma_current_state = P2L_WAIT_READ_COMPLETION + and pd_pdm_data_valid_i = '1' and pd_pdm_master_cpld_i = '1' + and pd_pdm_hdr_cid_i = g_TID_CID then -- decrement number of data to be received p2l_data_cnt <= p2l_data_cnt - 1; @@ -388,8 +421,10 @@ begin p_next_item : process (clk_i) begin if rising_edge(clk_i) then - if p2l_dma_current_state = P2L_WAIT_READ_COMPLETION - and is_next_item = '1' and pd_pdm_data_valid_i = '1' + if p2l_dma_current_state = P2L_WAIT_READ_COMPLETION + and is_next_item = '1' + and pd_pdm_data_valid_i = '1' + and pd_pdm_hdr_cid_i = g_TID_CID then -- next item data are supposed to be received in the right order !! 
case p2l_data_cnt(2 downto 0) is @@ -400,7 +435,7 @@ begin when "101" => next_item_host_addr_h <= pd_pdm_data_i; when "100" => - next_item_len <= pd_pdm_data_i; + next_item_len <= pd_pdm_data_i; when "011" => next_item_next_l <= pd_pdm_data_i; when "010" => @@ -428,7 +463,7 @@ begin p_addr_cnt : process (clk_i) begin if rising_edge(clk_i) then - if rst_n_i = '0' then + if rst_n_i = '0' then -- there are definitely registers using rst_n_i as a clk_en in this process dma_busy_error <= '0'; to_wb_fifo_wr <= '0'; to_wb_fifo_wr_d <= '0'; @@ -436,6 +471,12 @@ begin to_wb_fifo_din_d <= to_wb_fifo_din; to_wb_fifo_wr_d <= to_wb_fifo_wr; + if p2l_dma_current_state /= P2L_IDLE and dma_ctrl_start_p2l_i = '1' then + dma_busy_error <= '1'; + else + dma_busy_error <= '0'; + end if; + if dma_ctrl_start_p2l_i = '1' then if p2l_dma_current_state = P2L_IDLE then -- dma_ctrl_target_addr_i is a byte address and target_addr_cnt is a @@ -443,11 +484,11 @@ begin target_addr_cnt <= unsigned(dma_ctrl_carrier_addr_i(31 downto 2)); -- stores byte swap info for the current DMA transfer to_wb_fifo_byte_swap <= dma_ctrl_byte_swap_i; - else - dma_busy_error <= '1'; end if; - elsif p2l_dma_current_state = P2L_WAIT_READ_COMPLETION - and is_next_item = '0' and pd_pdm_data_valid_i = '1' + elsif p2l_dma_current_state = P2L_WAIT_READ_COMPLETION + and is_next_item = '0' + and pd_pdm_data_valid_i = '1' + and pd_pdm_hdr_cid_i = g_TID_CID then -- write target address and data to the sync fifo to_wb_fifo_wr <= '1'; @@ -455,8 +496,17 @@ begin to_wb_fifo_din(61 downto 32) <= std_logic_vector(target_addr_cnt); -- increment target address counter target_addr_cnt <= target_addr_cnt + 1; + + -- indicate that this is the last "beat" of the transfer so we can control the switch when the wishbone interface is done! 
+ if l2p_len_cnt = 0 and pd_pdm_data_last_i = '1' and l2p_len_header = 1 then + to_wb_fifo_din(62) <= '1'; + else + to_wb_fifo_din(62) <= '0'; + end if; + + -- NOTE we dont do anything for error conditions, Assumption DMA controller stops.. + else - dma_busy_error <= '0'; to_wb_fifo_wr <= '0'; end if; end if; @@ -497,32 +547,37 @@ begin to_wb_fifo_full <= to_wb_fifo_full_d(to_wb_fifo_full_d'high); + -- TBD: should we monitor the fill level: e.g. dont issue a write until there is space in the + -- receiving fifo to avoid backpressure on the bus cmp_to_wb_fifo : generic_async_fifo_dual_rst generic map ( - g_DATA_WIDTH => 64, + g_DATA_WIDTH => 63, g_SIZE => g_FIFO_SIZE, g_SHOW_AHEAD => TRUE, g_WITH_WR_FULL => FALSE, g_WITH_WR_ALMOST_FULL => TRUE, -- 20 less to give time to the GN4124 to react to P2L_RDY going low. - g_ALMOST_FULL_THRESHOLD => g_FIFO_SIZE - c_SYNC_FIFO_FULL_DELAY - 20) - port map ( + g_ALMOST_FULL_THRESHOLD => g_FIFO_SIZE - c_SYNC_FIFO_FULL_DELAY - 20 + ) port map ( -- write port rst_wr_n_i => fifo_rst_n, clk_wr_i => clk_i, d_i => to_wb_fifo_din_d, we_i => to_wb_fifo_wr_d, wr_almost_full_o => to_wb_fifo_full_next, + wr_count_o => open, -- read port rst_rd_n_i => wb_fifo_rst_n, clk_rd_i => wb_dma_clk_i, q_o => to_wb_fifo_dout, rd_i => to_wb_fifo_rd, - rd_empty_o => to_wb_fifo_empty); + rd_empty_o => to_wb_fifo_empty + ); -- pause transfer from GN4124 if fifo is (almost) full p2l_rdy_o <= not(to_wb_fifo_full); + ------------------------------------------------------------------------------ -- Wishbone master (write only) ------------------------------------------------------------------------------ @@ -545,43 +600,74 @@ begin p_wb_master : process (wb_dma_clk_i) begin if rising_edge(wb_dma_clk_i) then - if wb_fifo_rst_n = '0' then - wb_dma_o.cyc <= '0'; - wb_dma_out_stb <= '0'; - wb_ack_cnt <= (others => '0'); - else - if to_wb_fifo_rd = '1' then - -- Data available, read them from the fifo. 
- wb_dma_o.adr(31 downto 30) <= "00"; - wb_dma_o.adr(29 downto 0) <= to_wb_fifo_dout(61 downto 32); - wb_dma_o.dat <= to_wb_fifo_dout(31 downto 0); - - -- Data/addresses are valid when fifo was just read. - wb_dma_out_stb <= '1'; - wb_dma_o.cyc <= '1'; - else - -- No read. - if wb_dma_out_stb = '1' and wb_dma_i.stall = '1' then - -- Data were not read, just wait. - null; - elsif to_wb_fifo_empty = '1' then - -- No more data to produce. - wb_dma_out_stb <= '0'; + wb_sent_all_data <= '0'; + if to_wb_fifo_rd = '1' then + -- Data available, read them from the fifo. + wb_dma_o.adr(31 downto 30) <= "00"; + wb_dma_o.adr(29 downto 0) <= to_wb_fifo_dout(61 downto 32); + wb_dma_o.dat <= to_wb_fifo_dout(31 downto 0); + + + -- to_wb_fifo_dout(62) flags the last transfer; track it and report it back to the FSM to indicate done + wb_wdat_is_last <= to_wb_fifo_dout(62); - if wb_ack_cnt = 0 then - -- End of the burst - wb_dma_o.cyc <= '0'; + -- Data/addresses are valid when fifo was just read. + wb_dma_out_stb <= '1'; + wb_dma_o.cyc <= '1'; + else + -- No read. + if wb_dma_out_stb = '1' and wb_dma_i.stall = '1' then + -- Data were not read, just wait. + null; + elsif to_wb_fifo_empty = '1' then + -- No more data to produce. + wb_dma_out_stb <= '0'; + + if wb_ack_cnt = 0 then + -- End of the burst + wb_dma_o.cyc <= '0'; + + if wb_wdat_is_last = '1' then + wb_sent_all_data <= '1'; + wb_wdat_is_last <= '0'; end if; + end if; end if; + end if; - -- Track number of expected ack. - if wb_dma_tfr and wb_dma_i.ack = '0' then - wb_ack_cnt <= wb_ack_cnt + 1; - elsif not wb_dma_tfr and wb_dma_i.ack = '1' then - wb_ack_cnt <= wb_ack_cnt - 1; - end if; + -- Track number of expected ack. + if wb_dma_tfr and wb_dma_i.ack = '0' then + wb_ack_cnt <= wb_ack_cnt + 1; + elsif not wb_dma_tfr and wb_dma_i.ack = '1' then + wb_ack_cnt <= wb_ack_cnt - 1; + end if; + + -- we don't want to sync-reset all signals, and we don't want to infer a clk_en, so we do it like this. 
+ if wb_fifo_rst_n = '0' then + wb_dma_o.cyc <= '0'; + wb_dma_out_stb <= '0'; + wb_ack_cnt <= (others => '0'); end if; + end if; end process p_wb_master; + + + -- TBD: should we use the flowcontrol aspects of this to backpressure WB? + -- That would allow a counter of outstanding "transactions" to count off so + -- we know they are all completed: + -- e.g. separate pcie_done_o and wb_done_o signals to the controller. + pulse_sync_wb_done : entity work.gc_pulse_synchronizer2 + port map ( + clk_in_i => wb_dma_clk_i, + rst_in_n_i => wb_fifo_rst_n, + clk_out_i => clk_i, + rst_out_n_i => fifo_rst_n, + d_ready_o => open, + d_ack_p_o => open, + d_p_i => wb_sent_all_data, + q_p_o => fsm_wb_reading_complete + ); + end architecture arch;