From 4aa109296b57330e208cd53919465e5087e325ce Mon Sep 17 00:00:00 2001 From: Tristan Gingold <tristan.gingold@cern.ch> Date: Wed, 4 May 2022 11:37:13 +0200 Subject: [PATCH] Add dram ECC tests, dram scrubber disabled at start --- hdl/rtl/hydra_core.vhd | 5 ++- hdl/rtl/hydra_dram.vhd | 35 +++++++++++---------- hdl/rtl/hydra_iram.vhd | 28 ++++++++--------- hdl/rtl/hydra_supervisor_regs.cheby | 5 +++ hdl/rtl/hydra_supervisor_regs.vhd | 49 ++++++++++++++++++++++++----- sw/include/hydra_supervisor_regs.h | 27 ++++++++++------ sw/sf2-test/main.c | 46 ++++++++++++++++++++++----- 7 files changed, 139 insertions(+), 56 deletions(-) diff --git a/hdl/rtl/hydra_core.vhd b/hdl/rtl/hydra_core.vhd index 7ee1a1d..3000c8b 100644 --- a/hdl/rtl/hydra_core.vhd +++ b/hdl/rtl/hydra_core.vhd @@ -113,6 +113,7 @@ architecture arch of hydra_core is signal dm_mem_rdata, dm_wb_rdata : std_logic_vector(31 downto 0); signal dm_wb_write, dm_select_wb, dm_select_ram : std_logic; signal dm_done, dm_err : std_logic; + signal dram_ecc_mask : std_logic_vector(7 downto 0); signal dwb_out : t_wishbone_master_out; @@ -263,12 +264,13 @@ begin port map ( clk_i => clk_sys_i, rst_n_i => rst_n_i, - cpu_rst_n_i => cpu_rst_n_i, + scrub_en_i => dram_scrub_en, addr_i => reg_dm_addr(g_DRAM_LOG_SIZE - 1 downto 2), en_i => reg_dm_en, we_i => reg_dm_store, sel_i => reg_dm_data_select, data_i => reg_dm_data_s, + eccm_i => dram_ecc_mask(6 downto 0), data_o => dm_mem_rdata, done_o => dm_done, err_o => dm_err, @@ -524,6 +526,7 @@ begin dram_scrub_de_i => nbr_dram_scrub_uncorr, cpu_data_err_i => nbr_cpu_data_err, cpu_iaddr_err_i => nbr_cpu_iaddr_err, + dram_ecc_mask_o => dram_ecc_mask, scrub_cfg_iram_en_i => iram_scrub_en, scrub_cfg_iram_en_o => scrub_cfg_iram_val, scrub_cfg_dram_en_i => dram_scrub_en, diff --git a/hdl/rtl/hydra_dram.vhd b/hdl/rtl/hydra_dram.vhd index 63d7a7d..405f08d 100644 --- a/hdl/rtl/hydra_dram.vhd +++ b/hdl/rtl/hydra_dram.vhd @@ -37,7 +37,9 @@ entity hydra_dram is -- Note: only writes are allowed during reset. clk_i : in std_logic; rst_n_i : in std_logic; - cpu_rst_n_i : in std_logic; + + -- Enable scrubber + scrub_en_i : in std_logic; -- Read port -- RDONE_O is a pulse. @@ -47,6 +49,7 @@ entity hydra_dram is sel_i : in std_logic_vector(3 downto 0); data_i : in std_logic_vector(31 downto 0); data_o : out std_logic_vector(31 downto 0); + eccm_i : in std_logic_vector(6 downto 0); done_o : out std_logic; err_o : out std_logic; @@ -128,7 +131,8 @@ begin rerr <= f_ecc_errors(rsyndrome) and ren_d; rerr_one <= f_ecc_one_error(rsyndrome) and ren_d; - err_o <= rerr; + -- Only single errors are reported, double errors are fatal. + err_o <= rerr_one; done_o <= r_done or w_done; p_scrub: process (clk_i) @@ -139,11 +143,11 @@ begin scrub_addr <= (others => '0'); scrub_rd <= '0'; else - if scrub_done = '1' and rerr = '0' then + if scrub_done = '1' and rerr_one = '0' then scrub_counter <= unsigned(scrubber_period_i); scrub_addr <= std_logic_vector(unsigned(scrub_addr) + 1); scrub_rd <= '0'; - else + elsif scrub_en_i = '1' then if scrub_counter = (scrub_counter'range => '0') then scrub_rd <= '1'; else @@ -158,7 +162,7 @@ begin p_done, r_done, scrub_done, rerr, rerr_one, rdata_ecc, rsyndrome, last_addr, last_we, last_sel, recc, en_i, addr_i, we_i, data_i, sel_i, r_done_d, - scrub_rd, scrub_addr, scrub_done_d, cpu_rst_n_i) + scrub_rd, scrub_addr, scrub_done_d) is variable d : std_logic_vector(31 downto 0); begin @@ -185,23 +189,22 @@ begin if (p_done = '1' or r_done = '1' or scrub_done = '1') and rerr = '1' then -- There was an error on the last access. -- Write to fix it. + addr <= last_addr; + n_last_addr <= last_addr; + n_last_we <= last_we; + n_last_sel <= last_sel; if rerr_one = '1' then -- Correctable: correct it. n_scrub_se <= scrub_done; n_ecc_one <= p_done or r_done; wdata_ecc <= f_fix_error(rsyndrome, rdata_ecc(38 downto 32), rdata_ecc(31 downto 0)); + wen <= '1'; + n_state <= S_REWRITE; else - -- Uncorrectable. Just recompute the ECC to be able to continue. + -- Uncorrectable. Do not fix it. n_scrub_de <= scrub_done; n_ecc_fatal <= p_done or r_done; - wdata_ecc <= recc & rdata_ecc(31 downto 0); end if; - addr <= last_addr; - wen <= '1'; - n_last_addr <= last_addr; - n_last_we <= last_we; - n_last_sel <= last_sel; - n_state <= S_REWRITE; elsif last_we = '1' then assert last_sel /= "1111"; -- The partial write; @@ -230,7 +233,7 @@ begin if sel_i = "1111" then -- Full write wen <= '1'; - wdata_ecc <= f_calc_ecc (data_i) & data_i; + wdata_ecc <= (f_calc_ecc (data_i) xor eccm_i) & data_i; n_w_done <= '1'; else -- Partial write: first read the word @@ -244,8 +247,8 @@ begin ren <= '1'; n_r_done <= '1'; end if; - elsif cpu_rst_n_i = '1' then - -- scrub if idle (but not during reset) + elsif scrub_en_i = '1' then + -- scrub if idle addr <= scrub_addr; ren <= '1'; n_scrub_done <= '1'; diff --git a/hdl/rtl/hydra_iram.vhd b/hdl/rtl/hydra_iram.vhd index 9769951..8092620 100644 --- a/hdl/rtl/hydra_iram.vhd +++ b/hdl/rtl/hydra_iram.vhd @@ -94,7 +94,7 @@ architecture arch of hydra_iram is signal last_raddr, n_last_raddr, scrub_addr : std_logic_vector(g_RAM_LOG_SIZE - 1 downto 2); signal scrub_counter : unsigned(15 downto 0); - signal scrub_rd, scrub_done, scrub_done_d, n_scrub_done : std_logic; + signal scrub_rd, scrub_done, n_scrub_done : std_logic; signal n_scrub_se, n_scrub_de : std_logic; type state_t is (S_READ, S_REWRITE); @@ -142,9 +142,10 @@ begin rerr <= f_ecc_errors(rsyndrome) and ren_d; rerr_one <= f_ecc_one_error(rsyndrome) and ren_d; - r1_err_o <= rerr; + -- Note: only single errors are reported, double errors are fatal. + r1_err_o <= rerr_one; r1_done_o <= r1_done; - r2_err_o <= rerr; + r2_err_o <= rerr_one; r2_done_o <= r2_done; wdone_o <= wdone; @@ -156,7 +157,7 @@ begin scrub_addr <= (others => '0'); scrub_rd <= '0'; else - if scrub_done = '1' and rerr = '0' then + if n_scrub_done = '1' then scrub_counter <= unsigned(scrubber_period_i); scrub_addr <= std_logic_vector(unsigned(scrub_addr) + 1); scrub_rd <= '0'; @@ -173,7 +174,7 @@ begin p_ctrl: process (state, r1_done, r2_done, rerr, rerr_one, rdata_ecc, rsyndrome, last_raddr, recc, r1_en_i, r1_addr_i, we_i, waddr_i, wdata_i, wecc_i, r1_done_d, r2_done_d, - r2_en_i, r2_addr_i, scrub_rd, scrub_addr, scrub_done, scrub_done_d, scrub_en_i) + r2_en_i, r2_addr_i, scrub_rd, scrub_addr, scrub_done, scrub_en_i) begin wen <= '0'; waddr <= (others => 'X'); @@ -197,21 +198,21 @@ begin if (r2_done = '1' or r1_done = '1' or scrub_done = '1') and rerr = '1' then -- There was an error on the last access. -- Write to fix it. + wdata_ecc <= f_fix_error(rsyndrome, rdata_ecc(38 downto 32), rdata_ecc(31 downto 0)); + waddr <= last_raddr; + n_last_raddr <= last_raddr; if rerr_one = '1' then -- Correctable: correct it. n_scrub_se <= scrub_done; n_ecc_one <= not scrub_done; - wdata_ecc <= f_fix_error(rsyndrome, rdata_ecc(38 downto 32), rdata_ecc(31 downto 0)); + wen <= '1'; + n_state <= S_REWRITE; else - -- Uncorrectable. Just recompute the ECC to be able to continue. + -- Uncorrectable. Do not fix. Either it is a fatal error, or it will be when + -- accessed by the cpu. A double-error detected by the scrubber is not fatal. n_scrub_de <= scrub_done; n_ecc_fatal <= not scrub_done; - wdata_ecc <= recc & rdata_ecc(31 downto 0); end if; - waddr <= last_raddr; - wen <= '1'; - n_last_raddr <= last_raddr; - n_state <= S_REWRITE; elsif we_i = '1' then -- Write waddr <= waddr_i; @@ -251,7 +252,6 @@ begin ren <= '1'; n_r1_done <= r1_done_d; n_r2_done <= r2_done_d; - n_scrub_done <= scrub_done_d; n_last_raddr <= last_raddr; n_state <= S_READ; end case; @@ -266,7 +266,6 @@ begin r2_done <= '0'; r2_done_d <= '0'; scrub_done <= '0'; - scrub_done_d <= '0'; state <= S_READ; last_raddr <= (others => 'X'); @@ -279,7 +278,6 @@ begin r1_done <= n_r1_done; r2_done_d <= r2_done; r2_done <= n_r2_done; - scrub_done_d <= scrub_done; scrub_done <= n_scrub_done; state <= n_state; diff --git a/hdl/rtl/hydra_supervisor_regs.cheby b/hdl/rtl/hydra_supervisor_regs.cheby index 31220e5..857d890 100644 --- a/hdl/rtl/hydra_supervisor_regs.cheby +++ b/hdl/rtl/hydra_supervisor_regs.cheby @@ -107,6 +107,11 @@ memory-map: description: Number of CPU errors on instruction bus width: 32 access: ro + - reg: + name: dram_ecc_mask + description: Change ECC bits on dram writes for testing + width: 8 + access: rw - reg: name: scrub_cfg description: configuration of scrubbers diff --git a/hdl/rtl/hydra_supervisor_regs.vhd b/hdl/rtl/hydra_supervisor_regs.vhd index 89b5d9c..8f917a2 100644 --- a/hdl/rtl/hydra_supervisor_regs.vhd +++ b/hdl/rtl/hydra_supervisor_regs.vhd @@ -71,6 +71,9 @@ entity hydra_supervisor_regs is -- Number of CPU errors on instruction bus cpu_iaddr_err_i : in std_logic_vector(31 downto 0); + -- Change ECC bits on dram writes for testing + dram_ecc_mask_o : out std_logic_vector(7 downto 0); + -- configuration of scrubbers -- Write 1 to enable iram scrubber scrub_cfg_iram_en_i : in std_logic; @@ -103,6 +106,9 @@ architecture syn of hydra_supervisor_regs is signal wd_key_reg : std_logic_vector(31 downto 0); signal wd_key_wreq : std_logic; signal wd_key_wack : std_logic; + signal dram_ecc_mask_reg : std_logic_vector(7 downto 0); + signal dram_ecc_mask_wreq : std_logic; + signal dram_ecc_mask_wack : std_logic; signal scrub_cfg_wreq : std_logic; signal iram_scrub_period_reg : std_logic_vector(15 downto 0); signal iram_scrub_period_wreq : std_logic; @@ -213,6 +219,22 @@ begin -- Register cpu_iaddr_err + -- Register dram_ecc_mask + dram_ecc_mask_o <= dram_ecc_mask_reg; + process (clk_i) begin + if rising_edge(clk_i) then + if rst_n_i = '0' then + dram_ecc_mask_reg <= "00000000"; + dram_ecc_mask_wack <= '0'; + else + if dram_ecc_mask_wreq = '1' then + dram_ecc_mask_reg <= wr_dat_d0(7 downto 0); + end if; + dram_ecc_mask_wack <= dram_ecc_mask_wreq; + end if; + end if; + end process; + -- Register scrub_cfg scrub_cfg_iram_en_o <= wr_dat_d0(0); scrub_cfg_dram_en_o <= wr_dat_d0(1); @@ -251,11 +273,12 @@ begin end process; -- Process for write requests. - process (wr_adr_d0, wr_req_d0, wd_key_wack, iram_scrub_period_wack, - dram_scrub_period_wack) begin + process (wr_adr_d0, wr_req_d0, wd_key_wack, dram_ecc_mask_wack, + iram_scrub_period_wack, dram_scrub_period_wack) begin cpu_wreq <= '0'; wd_period_wreq <= '0'; wd_key_wreq <= '0'; + dram_ecc_mask_wreq <= '0'; scrub_cfg_wreq <= '0'; iram_scrub_period_wreq <= '0'; dram_scrub_period_wreq <= '0'; @@ -306,14 +329,18 @@ begin -- Reg cpu_iaddr_err wr_ack_int <= wr_req_d0; when "01110" => + -- Reg dram_ecc_mask + dram_ecc_mask_wreq <= wr_req_d0; + wr_ack_int <= dram_ecc_mask_wack; + when "01111" => -- Reg scrub_cfg scrub_cfg_wreq <= wr_req_d0; wr_ack_int <= wr_req_d0; - when "01111" => + when "10000" => -- Reg iram_scrub_period iram_scrub_period_wreq <= wr_req_d0; wr_ack_int <= iram_scrub_period_wack; - when "10000" => + when "10001" => -- Reg dram_scrub_period dram_scrub_period_wreq <= wr_req_d0; wr_ack_int <= dram_scrub_period_wack; @@ -327,8 +354,9 @@ begin reset_cause_watchdog_i, cpu_reset_i, cpu_recovery_i, force_divergence_i, wd_period_i, wd_count_i, iram_ecc_se_i, iram_scrub_se_i, iram_scrub_de_i, dram_ecc_se_i, dram_scrub_se_i, - dram_scrub_de_i, cpu_data_err_i, cpu_iaddr_err_i, scrub_cfg_iram_en_i, - scrub_cfg_dram_en_i, iram_scrub_period_reg, dram_scrub_period_reg) begin + dram_scrub_de_i, cpu_data_err_i, cpu_iaddr_err_i, dram_ecc_mask_reg, + scrub_cfg_iram_en_i, scrub_cfg_dram_en_i, iram_scrub_period_reg, + dram_scrub_period_reg) begin -- By default ack read requests rd_dat_d0 <= (others => 'X'); force_divergence_rd_o <= '0'; @@ -396,17 +424,22 @@ begin rd_ack_d0 <= rd_req_int; rd_dat_d0 <= cpu_iaddr_err_i; when "01110" => + -- Reg dram_ecc_mask + rd_ack_d0 <= rd_req_int; + rd_dat_d0(7 downto 0) <= dram_ecc_mask_reg; + rd_dat_d0(31 downto 8) <= (others => '0'); + when "01111" => -- Reg scrub_cfg rd_ack_d0 <= rd_req_int; rd_dat_d0(0) <= scrub_cfg_iram_en_i; rd_dat_d0(1) <= scrub_cfg_dram_en_i; rd_dat_d0(31 downto 2) <= (others => '0'); - when "01111" => + when "10000" => -- Reg iram_scrub_period rd_ack_d0 <= rd_req_int; rd_dat_d0(15 downto 0) <= iram_scrub_period_reg; rd_dat_d0(31 downto 16) <= (others => '0'); - when "10000" => + when "10001" => -- Reg dram_scrub_period rd_ack_d0 <= rd_req_int; rd_dat_d0(15 downto 0) <= dram_scrub_period_reg; diff --git a/sw/include/hydra_supervisor_regs.h b/sw/include/hydra_supervisor_regs.h index 287492c..0f5ae16 100644 --- a/sw/include/hydra_supervisor_regs.h +++ b/sw/include/hydra_supervisor_regs.h @@ -1,6 +1,6 @@ #ifndef __CHEBY__HYDRA_SUPERVISOR_REGS__H__ #define __CHEBY__HYDRA_SUPERVISOR_REGS__H__ -#define HYDRA_SUPERVISOR_REGS_SIZE 66 /* 0x42 */ +#define HYDRA_SUPERVISOR_REGS_SIZE 70 /* 0x46 */ /* Cause of a reset */ #define HYDRA_SUPERVISOR_REGS_RESET_CAUSE 0x0UL @@ -50,17 +50,20 @@ /* Number of CPU errors on instruction bus */ #define HYDRA_SUPERVISOR_REGS_CPU_IADDR_ERR 0x34UL +/* Change ECC bits on dram writes for testing */ +#define HYDRA_SUPERVISOR_REGS_DRAM_ECC_MASK 0x38UL + /* configuration of scrubbers */ -#define HYDRA_SUPERVISOR_REGS_SCRUB_CFG 0x38UL +#define HYDRA_SUPERVISOR_REGS_SCRUB_CFG 0x3cUL #define HYDRA_SUPERVISOR_REGS_SCRUB_CFG_IRAM_EN 0x1UL #define HYDRA_SUPERVISOR_REGS_SCRUB_CFG_DRAM_EN 0x2UL /* Maximum number of cycles between a scrub */ -#define HYDRA_SUPERVISOR_REGS_IRAM_SCRUB_PERIOD 0x3cUL +#define HYDRA_SUPERVISOR_REGS_IRAM_SCRUB_PERIOD 0x40UL #define HYDRA_SUPERVISOR_REGS_IRAM_SCRUB_PERIOD_PRESET 0x800UL /* Maximum number of cycles between a scrub */ -#define HYDRA_SUPERVISOR_REGS_DRAM_SCRUB_PERIOD 0x40UL +#define HYDRA_SUPERVISOR_REGS_DRAM_SCRUB_PERIOD 0x44UL #define HYDRA_SUPERVISOR_REGS_DRAM_SCRUB_PERIOD_PRESET 0x800UL struct hydra_supervisor_regs { @@ -106,16 +109,22 @@ struct hydra_supervisor_regs { /* [0x34]: REG (ro) Number of CPU errors on instruction bus */ uint32_t cpu_iaddr_err; - /* [0x38]: REG (rw) configuration of scrubbers */ + /* [0x38]: REG (rw) Change ECC bits on dram writes for testing */ + uint8_t dram_ecc_mask; + + /* padding to: 60 words */ + uint8_t __padding_0[3]; + + /* [0x3c]: REG (rw) configuration of scrubbers */ uint32_t scrub_cfg; - /* [0x3c]: REG (rw) Maximum number of cycles between a scrub */ + /* [0x40]: REG (rw) Maximum number of cycles between a scrub */ uint16_t iram_scrub_period; - /* padding to: 64 words */ - uint8_t __padding_0[2]; + /* padding to: 68 words */ + uint8_t __padding_1[2]; - /* [0x40]: REG (rw) Maximum number of cycles between a scrub */ + /* [0x44]: REG (rw) Maximum number of cycles between a scrub */ uint16_t dram_scrub_period; }; diff --git a/sw/sf2-test/main.c b/sw/sf2-test/main.c index 86d031e..b39116e 100644 --- a/sw/sf2-test/main.c +++ b/sw/sf2-test/main.c @@ -18,6 +18,8 @@ extern void clear_bss(void); static enum t_test { + TEST_DRAM_DE, + TEST_DRAM_SE, TEST_IRAM_DE, TEST_IRAM_SE, TEST_IRAM_SCRUB, @@ -91,6 +93,8 @@ uart_puts (const char *s) uart_putc (*s++); } +static volatile unsigned ecc[4]; + #define PAD_LEN 8 static volatile unsigned pad[8]; @@ -159,17 +163,39 @@ main (void) uart_putc('\n'); switch (next_test) { + case TEST_DRAM_DE: + SUPERVISOR->dram_ecc_mask = 0x03; + ecc[0] = 12; + ecc[2] = 23; + SUPERVISOR->dram_ecc_mask = 0x01; + ecc[1] = 56; + ecc[3] = 78; + SUPERVISOR->dram_ecc_mask = 0x00; + next_test = TEST_DRAM_SE; + /* Order is important! */ + asm volatile (""); + v = ecc[0]; + unreachable(); + break; + + case TEST_DRAM_SE: + if (SUPERVISOR->dram_ecc_se != 0) + unreachable(); + v = ecc[1]; + if (SUPERVISOR->dram_ecc_se != 1) + unreachable(); + next_test = TEST_IRAM_DE; + /* Fallthrough */ + case TEST_IRAM_DE: next_test = TEST_IRAM_SE; + /* Generate a fatal error */ v = *(volatile unsigned *)0xfffc; unreachable(); break; case TEST_IRAM_SE: - /* Must have been corrected. */ - v = *(volatile unsigned *)0xfffc; - if (SUPERVISOR->iram_ecc_se != 0) - unreachable(); + /* Generate a single error */ v = *(volatile unsigned *)0xfff8; if (SUPERVISOR->iram_ecc_se != 1) unreachable(); @@ -182,13 +208,19 @@ main (void) unreachable(); uart_puts("Start IRAM scrubber"); SUPERVISOR->scrub_cfg = HYDRA_SUPERVISOR_REGS_SCRUB_CFG_IRAM_EN; + next_test = TEST_WD; while (SUPERVISOR->iram_scrub_de == 0) - uart_putc('.'); + SUPERVISOR->wd_key = WD_KEY; uart_putc('\n'); if (SUPERVISOR->iram_scrub_se != 1 - || SUPERVISOR->iram_scrub_de != 1) + || SUPERVISOR->iram_scrub_de != 2) { + uart_puts ("se:"); + uart_put_hex8 (SUPERVISOR->iram_scrub_se); + uart_puts (", de:"); + uart_put_hex8 (SUPERVISOR->iram_scrub_de); + while (1) ; unreachable(); - next_test = TEST_WD; + } /* Fallthrough */ case TEST_WD: -- GitLab