From 71cf6b89c23c543bf661c2a8c90d937b5121cdc6 Mon Sep 17 00:00:00 2001
From: Tristan Gingold <tristan.gingold@cern.ch>
Date: Fri, 6 May 2022 14:57:49 +0200
Subject: [PATCH] Add iram ecc fatal address, distinguish ecc fatal errors

---
 hdl/rtl/hydra_core.vhd              | 22 +++++++++----
 hdl/rtl/hydra_dram.vhd              |  4 +--
 hdl/rtl/hydra_iram.vhd              |  7 +++-
 hdl/rtl/hydra_supervisor_regs.cheby | 19 ++++++++---
 hdl/rtl/hydra_supervisor_regs.vhd   | 51 +++++++++++++++++++----------
 sw/include/hydra_supervisor_regs.h  | 31 +++++++++++-------
 sw/sf2-test/main.c                  | 32 ++++++++++++++----
 7 files changed, 116 insertions(+), 50 deletions(-)

diff --git a/hdl/rtl/hydra_core.vhd b/hdl/rtl/hydra_core.vhd
index 3000c8b..d2d489f 100644
--- a/hdl/rtl/hydra_core.vhd
+++ b/hdl/rtl/hydra_core.vhd
@@ -134,8 +134,10 @@ architecture arch of hydra_core is
   signal scrub_cfg_wr, scrub_cfg_iram_val, scrub_cfg_dram_val : std_logic;
   signal nbr_cpu_data_err : std_logic_vector (31 downto 0);
   signal nbr_cpu_iaddr_err : std_logic_vector (31 downto 0);
+  signal iram_de_addr : std_logic_vector (31 downto 0) := (others => '0');
 
-  signal reset_cause_cpu, reset_cause_ecc, reset_cause_wd : std_logic;
+  signal reset_cause_cpu, reset_cause_wd : std_logic;
+  signal reset_cause_iram_ecc, reset_cause_dram_ecc : std_logic;
   signal dram_scrub_period, iram_scrub_period : std_logic_vector(15 downto 0);
 
   --  Initial watchdog period.
@@ -155,7 +157,6 @@ begin
   dwb_o <= dwb_out;
 
   cpu_rst2 <= "111" when rst_n_i = '0' or cpu_rst_n_i = '0' else cpu_rst;
-  cpu_err_o <= '1' when cpu_rst /= "111" and cpu_rst /= "000" else '0';
       
   inst_cpus : entity work.hydra_triple_cpu
     port map (
@@ -234,6 +235,7 @@ begin
 
       ecc_one_o => iram_ecc_err,
       ecc_fatal_o => iram_ecc_fatal,
+      ecc_fatal_addr_o => iram_de_addr(g_IRAM_LOG_SIZE - 1 downto 2),
       scrub_se_o => iram_scrub_se,
       scrub_de_o => iram_scrub_de,
       scrubber_period_i => iram_scrub_period
@@ -503,7 +505,8 @@ begin
       wb_i => sv_wb_out,
       wb_o => sv_wb_in,
       reset_cause_cpu_i => reset_cause_cpu,
-      reset_cause_ecc_i => reset_cause_ecc,
+      reset_cause_iram_ecc_i => reset_cause_iram_ecc,
+      reset_cause_dram_ecc_i => reset_cause_dram_ecc,
       reset_cause_watchdog_i => reset_cause_wd,
       cpu_reset_i => cpu_rst,
       cpu_reset_o => open,
@@ -527,6 +530,7 @@ begin
       cpu_data_err_i => nbr_cpu_data_err,
       cpu_iaddr_err_i => nbr_cpu_iaddr_err,
       dram_ecc_mask_o => dram_ecc_mask,
+      iram_de_addr_i => iram_de_addr,
       scrub_cfg_iram_en_i => iram_scrub_en,
       scrub_cfg_iram_en_o => scrub_cfg_iram_val,
       scrub_cfg_dram_en_i => dram_scrub_en,
@@ -539,6 +543,8 @@ begin
   process (clk_sys_i) is
   begin
     if rising_edge(clk_sys_i) then
+      cpu_err_o <= '0';
+
       if rst_n_i = '0' or cpu_rst_n_i = '0' then
         state <= S_VOTER;
         cpu_rst <= (others => '1');
@@ -546,7 +552,8 @@ begin
         nbr_cpu_data_err <= (others => '0');
         nbr_cpu_iaddr_err <= (others => '0');
         reset_cause_cpu <= '0';
-        reset_cause_ecc <= '0';
+        reset_cause_iram_ecc <= '0';
+        reset_cause_dram_ecc <= '0';
         reset_cause_wd <= '0';
       else
         case state is
@@ -578,15 +585,18 @@ begin
             end if;
         end case;
 
-        if cpu_sync = "000" or cpu_rst_err = '1'
+        if cpu_sync = "000"
+          or cpu_rst_err = '1'
           or (err_cpu_dm = '1' and state = S_LOCK)
           or (cpu_wr = '1' and cpu_recovery_in = '1')
         then
           --  Fatal error
           cpu_rst <= (others => '1');
           reset_cause_cpu <= not cpu_rst_err;
-          reset_cause_ecc <= dram_ecc_fatal or iram_ecc_fatal;
+          reset_cause_iram_ecc <= iram_ecc_fatal;
+          reset_cause_dram_ecc <= dram_ecc_fatal;
           reset_cause_wd <= wd_timeout;
+          cpu_err_o <= '1';
           state <= S_VOTER;
         end if;
       end if;
diff --git a/hdl/rtl/hydra_dram.vhd b/hdl/rtl/hydra_dram.vhd
index 405f08d..4b52bef 100644
--- a/hdl/rtl/hydra_dram.vhd
+++ b/hdl/rtl/hydra_dram.vhd
@@ -160,9 +160,9 @@ begin
 
   p_ctrl: process (state,
                    p_done, r_done, scrub_done, rerr,
-                   rerr_one, rdata_ecc, rsyndrome, last_addr, last_we, last_sel, recc,
+                   rerr_one, rdata_ecc, rsyndrome, last_addr, last_we, last_sel, eccm_i,
                    en_i, addr_i, we_i, data_i, sel_i, r_done_d,
-                   scrub_rd, scrub_addr, scrub_done_d)
+                   scrub_rd, scrub_addr, scrub_done_d, scrub_en_i)
   is
     variable d : std_logic_vector(31 downto 0);
   begin
diff --git a/hdl/rtl/hydra_iram.vhd b/hdl/rtl/hydra_iram.vhd
index 8092620..edf6c25 100644
--- a/hdl/rtl/hydra_iram.vhd
+++ b/hdl/rtl/hydra_iram.vhd
@@ -68,6 +68,7 @@ entity hydra_iram is
     --  For statistics
     ecc_one_o    : out std_logic;
     ecc_fatal_o  : out std_logic;
+    ecc_fatal_addr_o : out std_logic_vector(g_RAM_LOG_SIZE - 1 downto 2);
     scrub_se_o  : out std_logic;
     scrub_de_o  : out std_logic;
 
@@ -172,7 +173,7 @@ begin
     end if;
   end process;
 
-  p_ctrl: process (state, r1_done, r2_done, rerr, rerr_one, rdata_ecc, rsyndrome, last_raddr, recc,
+  p_ctrl: process (state, r1_done, r2_done, rerr, rerr_one, rdata_ecc, rsyndrome, last_raddr,
                    r1_en_i, r1_addr_i, we_i, waddr_i, wdata_i, wecc_i, r1_done_d, r2_done_d,
                    r2_en_i, r2_addr_i, scrub_rd, scrub_addr, scrub_done, scrub_en_i)
   begin
@@ -273,6 +274,7 @@ begin
         ecc_fatal_o <= '0';
         scrub_se_o <= '0';
         scrub_de_o <= '0';
+        ecc_fatal_addr_o <= (others => '0');
       else
         r1_done_d <= r1_done;
         r1_done <= n_r1_done;
@@ -285,6 +287,9 @@ begin
 
         ecc_one_o <= n_ecc_one;
         ecc_fatal_o <= n_ecc_fatal;
+        if n_ecc_fatal = '1' then
+          ecc_fatal_addr_o <= n_last_raddr;
+        end if;
         scrub_se_o <= n_scrub_se;
         scrub_de_o <= n_scrub_de;
       end if;
diff --git a/hdl/rtl/hydra_supervisor_regs.cheby b/hdl/rtl/hydra_supervisor_regs.cheby
index 857d890..c87b88c 100644
--- a/hdl/rtl/hydra_supervisor_regs.cheby
+++ b/hdl/rtl/hydra_supervisor_regs.cheby
@@ -16,13 +16,17 @@ memory-map:
           description: CPU divergence
           range: 0
       - field:
-          name: ecc
-          description: Uncorrectable ECC error
+          name: iram_ecc
+          description: Uncorrectable ECC error in IRAM
           range: 1
+      - field:
+          name: dram_ecc
+          description: Uncorrectable ECC error in DRAM
+          range: 2
       - field:
           name: watchdog
           description: Watchdog timeout
-          range: 2
+          range: 3
   - reg:
       name: cpu
       description: state of cpus
@@ -112,6 +116,11 @@ memory-map:
       description: Change ECC bits on dram writes for testing
       width: 8
       access: rw
+  - reg:
+      name: iram_de_addr
+      description: Address of the last fatal error
+      width: 32
+      access: ro
   - reg:
       name: scrub_cfg
       description: configuration of scrubbers
@@ -134,11 +143,11 @@ memory-map:
       description: Maximum number of cycles between a scrub
       width: 16
       access: rw
-      preset: 2048
+      preset: 2047
   - reg:
       name: dram_scrub_period
       description: Maximum number of cycles between a scrub
       width: 16
       access: rw
-      preset: 2048
+      preset: 2047
   
diff --git a/hdl/rtl/hydra_supervisor_regs.vhd b/hdl/rtl/hydra_supervisor_regs.vhd
index 8f917a2..859e95c 100644
--- a/hdl/rtl/hydra_supervisor_regs.vhd
+++ b/hdl/rtl/hydra_supervisor_regs.vhd
@@ -17,8 +17,10 @@ entity hydra_supervisor_regs is
     -- Cause of a reset
     -- CPU divergence
     reset_cause_cpu_i    : in    std_logic;
-    -- Uncorrectable ECC error
-    reset_cause_ecc_i    : in    std_logic;
+    -- Uncorrectable ECC error in IRAM
+    reset_cause_iram_ecc_i : in    std_logic;
+    -- Uncorrectable ECC error in DRAM
+    reset_cause_dram_ecc_i : in    std_logic;
     -- Watchdog timeout
     reset_cause_watchdog_i : in    std_logic;
 
@@ -74,6 +76,9 @@ entity hydra_supervisor_regs is
     -- Change ECC bits on dram writes for testing
     dram_ecc_mask_o      : out   std_logic_vector(7 downto 0);
 
+    -- Address of the last fatal error
+    iram_de_addr_i       : in    std_logic_vector(31 downto 0);
+
     -- configuration of scrubbers
     -- Write 1 to enable iram scrubber
     scrub_cfg_iram_en_i  : in    std_logic;
@@ -235,6 +240,8 @@ begin
     end if;
   end process;
 
+  -- Register iram_de_addr
+
   -- Register scrub_cfg
   scrub_cfg_iram_en_o <= wr_dat_d0(0);
   scrub_cfg_dram_en_o <= wr_dat_d0(1);
@@ -245,7 +252,7 @@ begin
   process (clk_i) begin
     if rising_edge(clk_i) then
       if rst_n_i = '0' then
-        iram_scrub_period_reg <= "0000100000000000";
+        iram_scrub_period_reg <= "0000011111111111";
         iram_scrub_period_wack <= '0';
       else
         if iram_scrub_period_wreq = '1' then
@@ -261,7 +268,7 @@ begin
   process (clk_i) begin
     if rising_edge(clk_i) then
       if rst_n_i = '0' then
-        dram_scrub_period_reg <= "0000100000000000";
+        dram_scrub_period_reg <= "0000011111111111";
         dram_scrub_period_wack <= '0';
       else
         if dram_scrub_period_wreq = '1' then
@@ -333,14 +340,17 @@ begin
       dram_ecc_mask_wreq <= wr_req_d0;
       wr_ack_int <= dram_ecc_mask_wack;
     when "01111" =>
+      -- Reg iram_de_addr
+      wr_ack_int <= wr_req_d0;
+    when "10000" =>
       -- Reg scrub_cfg
       scrub_cfg_wreq <= wr_req_d0;
       wr_ack_int <= wr_req_d0;
-    when "10000" =>
+    when "10001" =>
       -- Reg iram_scrub_period
       iram_scrub_period_wreq <= wr_req_d0;
       wr_ack_int <= iram_scrub_period_wack;
-    when "10001" =>
+    when "10010" =>
       -- Reg dram_scrub_period
       dram_scrub_period_wreq <= wr_req_d0;
       wr_ack_int <= dram_scrub_period_wack;
@@ -350,13 +360,13 @@ begin
   end process;
 
   -- Process for read requests.
-  process (adr_int, rd_req_int, reset_cause_cpu_i, reset_cause_ecc_i,
-           reset_cause_watchdog_i, cpu_reset_i, cpu_recovery_i,
-           force_divergence_i, wd_period_i, wd_count_i, iram_ecc_se_i,
-           iram_scrub_se_i, iram_scrub_de_i, dram_ecc_se_i, dram_scrub_se_i,
-           dram_scrub_de_i, cpu_data_err_i, cpu_iaddr_err_i, dram_ecc_mask_reg,
-           scrub_cfg_iram_en_i, scrub_cfg_dram_en_i, iram_scrub_period_reg,
-           dram_scrub_period_reg) begin
+  process (adr_int, rd_req_int, reset_cause_cpu_i, reset_cause_iram_ecc_i,
+           reset_cause_dram_ecc_i, reset_cause_watchdog_i, cpu_reset_i,
+           cpu_recovery_i, force_divergence_i, wd_period_i, wd_count_i,
+           iram_ecc_se_i, iram_scrub_se_i, iram_scrub_de_i, dram_ecc_se_i,
+           dram_scrub_se_i, dram_scrub_de_i, cpu_data_err_i, cpu_iaddr_err_i,
+           dram_ecc_mask_reg, iram_de_addr_i, scrub_cfg_iram_en_i,
+           scrub_cfg_dram_en_i, iram_scrub_period_reg, dram_scrub_period_reg) begin
     -- By default ack read requests
     rd_dat_d0 <= (others => 'X');
     force_divergence_rd_o <= '0';
@@ -365,9 +375,10 @@ begin
       -- Reg reset_cause
       rd_ack_d0 <= rd_req_int;
       rd_dat_d0(0) <= reset_cause_cpu_i;
-      rd_dat_d0(1) <= reset_cause_ecc_i;
-      rd_dat_d0(2) <= reset_cause_watchdog_i;
-      rd_dat_d0(31 downto 3) <= (others => '0');
+      rd_dat_d0(1) <= reset_cause_iram_ecc_i;
+      rd_dat_d0(2) <= reset_cause_dram_ecc_i;
+      rd_dat_d0(3) <= reset_cause_watchdog_i;
+      rd_dat_d0(31 downto 4) <= (others => '0');
     when "00001" =>
       -- Reg cpu
       rd_ack_d0 <= rd_req_int;
@@ -429,17 +440,21 @@ begin
       rd_dat_d0(7 downto 0) <= dram_ecc_mask_reg;
       rd_dat_d0(31 downto 8) <= (others => '0');
     when "01111" =>
+      -- Reg iram_de_addr
+      rd_ack_d0 <= rd_req_int;
+      rd_dat_d0 <= iram_de_addr_i;
+    when "10000" =>
       -- Reg scrub_cfg
       rd_ack_d0 <= rd_req_int;
       rd_dat_d0(0) <= scrub_cfg_iram_en_i;
       rd_dat_d0(1) <= scrub_cfg_dram_en_i;
       rd_dat_d0(31 downto 2) <= (others => '0');
-    when "10000" =>
+    when "10001" =>
       -- Reg iram_scrub_period
       rd_ack_d0 <= rd_req_int;
       rd_dat_d0(15 downto 0) <= iram_scrub_period_reg;
       rd_dat_d0(31 downto 16) <= (others => '0');
-    when "10001" =>
+    when "10010" =>
       -- Reg dram_scrub_period
       rd_ack_d0 <= rd_req_int;
       rd_dat_d0(15 downto 0) <= dram_scrub_period_reg;
diff --git a/sw/include/hydra_supervisor_regs.h b/sw/include/hydra_supervisor_regs.h
index 0f5ae16..a92d887 100644
--- a/sw/include/hydra_supervisor_regs.h
+++ b/sw/include/hydra_supervisor_regs.h
@@ -1,12 +1,13 @@
 #ifndef __CHEBY__HYDRA_SUPERVISOR_REGS__H__
 #define __CHEBY__HYDRA_SUPERVISOR_REGS__H__
-#define HYDRA_SUPERVISOR_REGS_SIZE 70 /* 0x46 */
+#define HYDRA_SUPERVISOR_REGS_SIZE 74 /* 0x4a */
 
 /* Cause of a reset */
 #define HYDRA_SUPERVISOR_REGS_RESET_CAUSE 0x0UL
 #define HYDRA_SUPERVISOR_REGS_RESET_CAUSE_CPU 0x1UL
-#define HYDRA_SUPERVISOR_REGS_RESET_CAUSE_ECC 0x2UL
-#define HYDRA_SUPERVISOR_REGS_RESET_CAUSE_WATCHDOG 0x4UL
+#define HYDRA_SUPERVISOR_REGS_RESET_CAUSE_IRAM_ECC 0x2UL
+#define HYDRA_SUPERVISOR_REGS_RESET_CAUSE_DRAM_ECC 0x4UL
+#define HYDRA_SUPERVISOR_REGS_RESET_CAUSE_WATCHDOG 0x8UL
 
 /* state of cpus */
 #define HYDRA_SUPERVISOR_REGS_CPU 0x4UL
@@ -53,18 +54,21 @@
 /* Change ECC bits on dram writes for testing */
 #define HYDRA_SUPERVISOR_REGS_DRAM_ECC_MASK 0x38UL
 
+/* Address of the last fatal error */
+#define HYDRA_SUPERVISOR_REGS_IRAM_DE_ADDR 0x3cUL
+
 /* configuration of scrubbers */
-#define HYDRA_SUPERVISOR_REGS_SCRUB_CFG 0x3cUL
+#define HYDRA_SUPERVISOR_REGS_SCRUB_CFG 0x40UL
 #define HYDRA_SUPERVISOR_REGS_SCRUB_CFG_IRAM_EN 0x1UL
 #define HYDRA_SUPERVISOR_REGS_SCRUB_CFG_DRAM_EN 0x2UL
 
 /* Maximum number of cycles between a scrub */
-#define HYDRA_SUPERVISOR_REGS_IRAM_SCRUB_PERIOD 0x40UL
-#define HYDRA_SUPERVISOR_REGS_IRAM_SCRUB_PERIOD_PRESET 0x800UL
+#define HYDRA_SUPERVISOR_REGS_IRAM_SCRUB_PERIOD 0x44UL
+#define HYDRA_SUPERVISOR_REGS_IRAM_SCRUB_PERIOD_PRESET 0x7ffUL
 
 /* Maximum number of cycles between a scrub */
-#define HYDRA_SUPERVISOR_REGS_DRAM_SCRUB_PERIOD 0x44UL
-#define HYDRA_SUPERVISOR_REGS_DRAM_SCRUB_PERIOD_PRESET 0x800UL
+#define HYDRA_SUPERVISOR_REGS_DRAM_SCRUB_PERIOD 0x48UL
+#define HYDRA_SUPERVISOR_REGS_DRAM_SCRUB_PERIOD_PRESET 0x7ffUL
 
 struct hydra_supervisor_regs {
   /* [0x0]: REG (ro) Cause of a reset */
@@ -115,16 +119,19 @@ struct hydra_supervisor_regs {
   /* padding to: 60 words */
   uint8_t __padding_0[3];
 
-  /* [0x3c]: REG (rw) configuration of scrubbers */
+  /* [0x3c]: REG (ro) Address of the last fatal error */
+  uint32_t iram_de_addr;
+
+  /* [0x40]: REG (rw) configuration of scrubbers */
   uint32_t scrub_cfg;
 
-  /* [0x40]: REG (rw) Maximum number of cycles between a scrub */
+  /* [0x44]: REG (rw) Maximum number of cycles between a scrub */
   uint16_t iram_scrub_period;
 
-  /* padding to: 68 words */
+  /* padding to: 72 words */
   uint8_t __padding_1[2];
 
-  /* [0x44]: REG (rw) Maximum number of cycles between a scrub */
+  /* [0x48]: REG (rw) Maximum number of cycles between a scrub */
   uint16_t dram_scrub_period;
 };
 
diff --git a/sw/sf2-test/main.c b/sw/sf2-test/main.c
index b39116e..b3e6262 100644
--- a/sw/sf2-test/main.c
+++ b/sw/sf2-test/main.c
@@ -53,7 +53,7 @@ static void
 uart_raw_putc (unsigned char c)
 {
   /* Wait until TEMT (transmit empty) is set.  */
-  /* NB: bit 5 for RX not empty */
+  /* NB: bit 5 for TX not empty */
   do {
     SUPERVISOR->wd_key = WD_KEY;
   } while (!(*(volatile unsigned*)UART_LSR & 0x20));
@@ -61,6 +61,17 @@ uart_raw_putc (unsigned char c)
   *(volatile unsigned *)UART_TX = c;
 }
 
+static unsigned char
+uart_raw_getc (void)
+{
+  /* Wait until DR (data ready, bit 0) is set.  */
+  do {
+    SUPERVISOR->wd_key = WD_KEY;
+  } while (!(*(volatile unsigned*)UART_LSR & 0x1));
+
+  return *(volatile unsigned *)UART_RX;
+}
+
 static void
 uart_putc (char c)
 {
@@ -162,6 +173,13 @@ main (void)
   uart_put_hex_digit (next_test & 0x0f);
   uart_putc('\n');
 
+#if 0
+  uart_puts("Go?");
+  while (uart_raw_getc() != 'y')
+    ;
+  uart_putc('\n');
+#endif
+
   switch (next_test) {
   case TEST_DRAM_DE:
     SUPERVISOR->dram_ecc_mask = 0x03;
@@ -209,16 +227,18 @@ main (void)
     uart_puts("Start IRAM scrubber");
     SUPERVISOR->scrub_cfg = HYDRA_SUPERVISOR_REGS_SCRUB_CFG_IRAM_EN;
     next_test = TEST_WD;
-    while (SUPERVISOR->iram_scrub_de == 0)
+    do {
+      v = SUPERVISOR->iram_scrub_de;
       SUPERVISOR->wd_key = WD_KEY;
+    } while (v == 0);
+    /* Because the scrubber is always running, the counter is always
+       incrementing.  So the original value is kept. */
     uart_putc('\n');
-    if (SUPERVISOR->iram_scrub_se != 1
-        || SUPERVISOR->iram_scrub_de != 2) {
+    if (SUPERVISOR->iram_scrub_se != 1 || v > 2) {
       uart_puts ("se:");
       uart_put_hex8 (SUPERVISOR->iram_scrub_se);
       uart_puts (", de:");
-      uart_put_hex8 (SUPERVISOR->iram_scrub_de);
-      while (1) ;
+      uart_put_hex8 (v);
       unreachable();
     }
     /* Fallthrough */
-- 
GitLab