Commit 660ec751 authored by Tomasz Wlostowski

100 MHz version, still room for improvement

parent 51d4addc
......@@ -18,4 +18,5 @@ files = [ "rv_cpu.v",
"rv_csr.v",
"rv_timer.v",
"rv_exceptions.v",
"urv_iram.v",
"../sim/rv_icache_model.sv"];
......@@ -50,7 +50,11 @@ endmodule
module rv_cpu
(
#(
parameter g_timer_frequency = 1000,
parameter g_clock_frequency = 100000000
)
(
input clk_i,
input rst_i,
......@@ -119,12 +123,16 @@ module rv_cpu
wire d_stall, d_kill;
wire [39:0] csr_time, csr_cycles;
wire [31:0] im_addr;
assign im_addr_o = im_addr;
rv_fetch fetch
(
.clk_i(clk_i),
.rst_i(rst_i),
.im_addr_o(im_addr_o),
.im_addr_o(im_addr),
.im_data_i(im_data_i),
.im_valid_i(im_valid_i),
......@@ -161,19 +169,22 @@ module rv_cpu
.TRIG3(TRIG3) );
-----/\----- EXCLUDED -----/\----- */
assign TRIG0 = f2d_pc;
assign TRIG1 = f2d_ir;
assign TRIG2[0] = rst_i;
assign TRIG2[1] = f2d_valid;
assign TRIG2[2] = f_kill;
assign TRIG2[3] = f_stall;
assign TRIG2[4] = w_stall_req;
assign TRIG2[5] = x_stall_req;
wire d_stall_req;
wire [31:0] d2x_alu_op1, d2x_alu_op2;
wire d2x_use_op1, d2x_use_op2;
wire d_x_rs1_bypass;
wire d_x_rs2_bypass;
wire d_w_rs1_bypass;
wire d_w_rs2_bypass;
assign TRIG0 = im_addr;
assign TRIG1 = im_data_i;
assign TRIG2[0] = im_valid_i;
rv_decode decode
(
......@@ -193,22 +204,27 @@ module rv_cpu
.rf_rs1_o(rf_rs1),
.rf_rs2_o(rf_rs2),
.d_x_rs1_bypass_i(d_x_rs1_bypass),
.d_x_rs2_bypass_i(d_x_rs2_bypass),
.d_w_rs1_bypass_i(d_w_rs1_bypass),
.d_w_rs2_bypass_i(d_w_rs2_bypass),
.x_load_hazard_o(d2x_load_hazard),
.x_valid_o(d2x_valid),
.x_pc_o(d2x_pc),
.x_rs1_o(d2x_rs1),
.x_rs2_o(d2x_rs2),
.x_imm_o(d2x_imm),
.x_rd_o(d2x_rd),
.x_shamt_o(d2x_shamt),
.x_fun_o(d2x_fun),
.x_opcode_o(d2x_opcode),
.x_shifter_sign_o(d2x_shifter_sign),
.x_imm_o(d2x_imm),
.x_is_signed_compare_o(d2x_is_signed_compare),
.x_is_signed_alu_op_o(d2x_is_signed_alu_op),
.x_is_add_o(d2x_is_add),
......@@ -220,15 +236,19 @@ module rv_cpu
.x_rd_source_o(d2x_rd_source),
.x_rd_write_o(d2x_rd_write),
.x_csr_sel_o ( d2x_csr_sel),
.x_csr_imm_o ( d2x_csr_imm),
.x_is_csr_o ( d2x_is_csr ),
.x_is_eret_o ( d2x_is_eret )
.x_csr_sel_o (d2x_csr_sel),
.x_csr_imm_o (d2x_csr_imm),
.x_is_csr_o (d2x_is_csr),
.x_is_eret_o (d2x_is_eret),
.x_alu_op1_o(d2x_alu_op1),
.x_alu_op2_o(d2x_alu_op2),
);
.x_use_op1_o(d2x_use_op1),
.x_use_op2_o(d2x_use_op2)
);
wire [4:0] x2w_rd;
wire [4:0] x2w_rd;
wire [31:0] x2w_rd_value;
wire [31:0] x2w_rd_shifter;
wire [31:0] x2w_rd_multiply;
......@@ -251,6 +271,13 @@ module rv_cpu
wire [31:0] rf_bypass_rd_value = x2w_rd_value;
wire rf_bypass_rd_write = rf_rd_write && !x2w_load; // multiply/shift too?
assign d_x_rs1_bypass = (d2x_rd == rf_rs1) && d2x_rd_write && d2x_valid;
assign d_x_rs2_bypass = (d2x_rd == rf_rs2) && d2x_rd_write && d2x_valid;
assign d_w_rs1_bypass = (x2w_rd == rf_rs1) && rf_rd_write && x2w_valid;
assign d_w_rs2_bypass = (x2w_rd == rf_rs2) && rf_rd_write && x2w_valid;
rv_regfile regfile
(
......@@ -277,7 +304,6 @@ module rv_cpu
);
rv_exec execute
......@@ -311,6 +337,13 @@ module rv_cpu
.d_is_load_i(d2x_is_load),
.d_is_store_i(d2x_is_store),
.d_is_undef_i(d2x_is_undef),
.d_alu_op1_i(d2x_alu_op1),
.d_alu_op2_i(d2x_alu_op2),
.d_use_op1_i(d2x_use_op1),
.d_use_op2_i(d2x_use_op2),
.d_rd_source_i(d2x_rd_source),
.d_rd_write_i(d2x_rd_write),
......@@ -355,7 +388,6 @@ module rv_cpu
wire [31:0] wb_trig2;
rv_writeback writeback
(
.clk_i(clk_i),
......@@ -406,7 +438,11 @@ module rv_cpu
assign TRIG2[16] = (stall_timeout == 63) ? 1'b1 : 1'b0;
rv_timer ctimer (
rv_timer
#(
.g_timer_frequency(g_timer_frequency),
.g_clock_frequency(g_clock_frequency)
) ctimer (
.clk_i(clk_i),
.rst_i(rst_i),
......
......@@ -96,4 +96,7 @@
`define EXCEPT_TIMER 9
`define EXCEPT_IRQ 10
`define OP_SEL_BYPASS_X 0
`define OP_SEL_BYPASS_W 1
`define OP_SEL_DIRECT 2
`define OP_SEL_IMM 3
`include "rv_defs.v"
`timescale 1ns/1ps
// rv_divide
//
// Multi-cycle sequential divider that produces one quotient bit per clock and
// shares a single 33-bit adder/subtractor (alu_result) between all steps.
//
// Sequencing, keyed on `state`:
//    0       idle; when start_divide is high, q and r are cleared and is_rem,
//            n_sign (MSB of d_rs1_i) and d_sign (MSB of d_rs2_i) are latched
//    1       n <= 0 - d_rs1_i when n_sign is set, 0 + d_rs1_i otherwise
//    2       d <= 0 - d_rs2_i when d_sign is set, 0 + d_rs2_i otherwise
//    3..34   32 iterations: compare/subtract d from the shifted partial
//            remainder r_next, shift the resulting bit into q
//    35      x_rd_o <= 0 -/+ q  (subtract when n_sign ^ d_sign)
//    36      x_rd_o <= 0 -/+ r  (subtract when n_sign)
// `done` is asserted in state 36 for non-remainder functions and in state 37
// for `FUNC_REM / `FUNC_REMU; the state register returns to 0 on the next edge.
//
// Only `state` is reset by rst_i; the data registers are not.
//
// NOTE(review): n_sign / d_sign are latched from the operand MSBs regardless
// of d_fun_i - confirm that the unsigned functions are handled as intended.
module rv_divide
  (
   input             clk_i,
   input             rst_i,

   input             x_stall_i,
   input             x_kill_i,
   output            x_stall_req_o,

   input             d_valid_i,
   input             d_is_divide_i,

   input [31:0]      d_rs1_i,
   input [31:0]      d_rs2_i,

   input [2:0]       d_fun_i,

   output reg [31:0] x_rd_o
   );

   reg [31:0]  q, r, n, d;
   reg         n_sign, d_sign;
   reg [5:0]   state;

   wire [32:0] alu_result;

   reg [31:0]  alu_op1;
   reg [31:0]  alu_op2;
   reg         is_rem;

   // Partial remainder shifted left by one, with the next bit of n (MSB first
   // over states 3..34) shifted in. Only consumed by the default (iteration)
   // arms of the case statements below.
   wire [31:0] r_next = { r[30:0], n[31 - (state - 3)] };

   // ALU operand selection (combinational, hence blocking assignments).
   always@*
     case(state) // synthesis full_case parallel_case
       0:       begin alu_op1 = 'hx;    alu_op2 = 'hx;     end
       1:       begin alu_op1 = 0;      alu_op2 = d_rs1_i; end
       2:       begin alu_op1 = 0;      alu_op2 = d_rs2_i; end
       35:      begin alu_op1 = 0;      alu_op2 = q;       end
       36:      begin alu_op1 = 0;      alu_op2 = r;       end
       default: begin alu_op1 = r_next; alu_op2 = d;       end
     endcase // case (state)

   reg         alu_sub;

   // Operands are zero-extended to 33 bits so that bit 32 of a subtraction
   // acts as the borrow flag.
   assign alu_result = alu_sub ? {1'b0, alu_op1} - {1'b0, alu_op2} : {1'b0, alu_op1} + {1'b0, alu_op2};

   // No borrow: alu_op1 >= alu_op2 (unsigned).
   wire        alu_ge = ~alu_result [32];

   wire        start_divide = !x_stall_i && !x_kill_i && d_valid_i && d_is_divide_i;

   wire        done = (is_rem ? state == 37 : state == 36 );

   // NOTE(review): `done` is false in state 0, so this request is also high
   // while the divider is idle - presumably qualified by the instantiating
   // module; confirm.
   assign x_stall_req_o = (start_divide || !done);

   // ALU add/subtract selection (combinational, hence blocking assignments).
   always@*
     case (state) // synthesis full_case parallel_case
       1:
         alu_sub = n_sign;
       2:
         alu_sub = d_sign;
       35:
         alu_sub = n_sign ^ d_sign;
       36:
         alu_sub = n_sign;
       default:
         alu_sub = 1;
     endcase // case (state)

   // State counter: starts on start_divide, then free-runs until `done`.
   always@(posedge clk_i)
     if(rst_i || done)
       state <= 0;
     else if (state != 0 || start_divide)
       state <= state + 1;

   // Datapath registers.
   always@(posedge clk_i)
     case ( state ) // synthesis full_case parallel_case
       0:
         if(start_divide)
           begin
              q <= 0;
              r <= 0;

              is_rem <= (d_fun_i == `FUNC_REM || d_fun_i ==`FUNC_REMU);

              n_sign <= d_rs1_i[31];
              d_sign <= d_rs2_i[31];
           end

       1:
         n <= alu_result[31:0];

       2:
         d <= alu_result[31:0];

       35:
         x_rd_o <= alu_result[31:0]; // quotient

       36:
         x_rd_o <= alu_result[31:0]; // remainder

       default: // 3..34: 32 divider iterations
         begin
            q <= { q[30:0], alu_ge };
            // The borrow bit is dropped explicitly; it is 0 whenever alu_ge is set.
            r <= alu_ge ? alu_result[31:0] : r_next;
         end
     endcase // case ( state )

endmodule // rv_divide
// rv_divide_nonrestoring
//
// Alternative sequential divider with the same port list as rv_divide, built
// around an add-or-subtract iteration followed by a single correction step.
//
// Sequencing, keyed on `state`:
//    0       idle; when start_divide is high, a is cleared and is_rem,
//            n_sign (MSB of d_rs1_i) and d_sign (MSB of d_rs2_i) are latched
//    1       q <= 0 - d_rs1_i when n_sign is set, 0 + d_rs1_i otherwise
//    2       d <= 0 - d_rs2_i when d_sign is set, 0 + d_rs2_i otherwise
//    3..34   32 iterations: a <= a_next - d when a_next[31] is 0,
//            a_next + d otherwise; the inverted result MSB is shifted into q
//    35      correction step: a <= a + d when a[31] is set
//    36      x_rd_o <= 0 -/+ q  (subtract when n_sign ^ d_sign)
//    37      x_rd_o <= 0 -/+ a  (subtract when n_sign)
// `done` is asserted in state 37 for non-remainder functions and in state 38
// for `FUNC_REM / `FUNC_REMU; the state register returns to 0 on the next edge.
//
// Only `state` is reset by rst_i; the data registers are not.
//
// NOTE(review): a_next shifts a constant 0 into the partial remainder; the
// value loaded into q in state 1 is never shifted into a. Confirm this is
// intended before relying on this module.
module rv_divide_nonrestoring
  (
   input             clk_i,
   input             rst_i,

   input             x_stall_i,
   input             x_kill_i,
   output            x_stall_req_o,

   input             d_valid_i,
   input             d_is_divide_i,

   input [31:0]      d_rs1_i,
   input [31:0]      d_rs2_i,

   input [2:0]       d_fun_i,

   output reg [31:0] x_rd_o
   );

   reg [31:0]  a, d, q;
   reg         n_sign, d_sign;
   reg [5:0]   state;

   wire [32:0] alu_result;

   reg [31:0]  alu_op1;
   reg [31:0]  alu_op2;
   reg         is_rem;

   // Partial remainder shifted left by one position.
   wire [31:0] a_next = { a[30:0], 1'b0 };

   // ALU operand selection (combinational, hence blocking assignments).
   always@*
     case(state) // synthesis full_case parallel_case
       0:       begin alu_op1 = 'hx;    alu_op2 = 'hx;     end
       1:       begin alu_op1 = 0;      alu_op2 = d_rs1_i; end
       2:       begin alu_op1 = 0;      alu_op2 = d_rs2_i; end
       35:      begin alu_op1 = a;      alu_op2 = d;       end
       36:      begin alu_op1 = 0;      alu_op2 = q;       end
       37:      begin alu_op1 = 0;      alu_op2 = a;       end
       default: begin alu_op1 = a_next; alu_op2 = d;       end
     endcase // case (state)

   reg         alu_sub;

   // Operands are zero-extended to 33 bits before the add/subtract.
   assign alu_result = alu_sub ? {1'b0, alu_op1} - {1'b0, alu_op2} : {1'b0, alu_op1} + {1'b0, alu_op2};

   wire        start_divide = !x_stall_i && !x_kill_i && d_valid_i && d_is_divide_i;

   wire        done = (is_rem ? state == 38 : state == 37 );

   // NOTE(review): `done` is false in state 0, so this request is also high
   // while the divider is idle - presumably qualified by the instantiating
   // module; confirm.
   assign x_stall_req_o = (start_divide || !done);

   // ALU add/subtract selection (combinational, hence blocking assignments).
   always@*
     case (state) // synthesis full_case parallel_case
       1:
         alu_sub = n_sign;
       2:
         alu_sub = d_sign;
       35:
         alu_sub = 0;
       36:
         alu_sub = n_sign ^ d_sign;
       37:
         alu_sub = n_sign;
       default:
         alu_sub = ~a_next[31];
     endcase // case (state)

   // State counter: starts on start_divide, then free-runs until `done`.
   always@(posedge clk_i)
     if(rst_i || done)
       state <= 0;
     else if (state != 0 || start_divide)
       state <= state + 1;

   // Datapath registers.
   always@(posedge clk_i)
     case ( state ) // synthesis full_case parallel_case
       0:
         if(start_divide)
           begin
              // q is not cleared here; it is loaded in state 1.
              a <= 0;

              is_rem <= (d_fun_i == `FUNC_REM || d_fun_i ==`FUNC_REMU);

              n_sign <= d_rs1_i[31];
              d_sign <= d_rs2_i[31];
           end

       1:
         q <= alu_result[31:0];

       2:
         d <= alu_result[31:0];

       35: // correction step
         if(a[31])
           a <= alu_result[31:0];

       36:
         x_rd_o <= alu_result[31:0]; // quotient

       37:
         x_rd_o <= alu_result[31:0]; // remainder

       default: // 3..34: 32 divider iterations
         begin
            // Bit 32 of the ALU result is dropped explicitly.
            a <= alu_result[31:0];
            q <= { q[30:0], ~alu_result[31] };
         end
     endcase // case ( state )

endmodule // rv_divide_nonrestoring
......@@ -62,6 +62,12 @@ module rv_exec
input d_is_store_i,
input d_is_divide_i,
input d_is_undef_i,
input [31:0] d_alu_op1_i,
input [31:0] d_alu_op2_i,
input d_use_op1_i,
input d_use_op2_i,
input [2:0] d_rd_source_i,
......@@ -231,25 +237,10 @@ module rv_exec
// decode ALU operands
always@*
begin
case (d_opcode_i)
`OPC_LUI: alu_op1 <= d_imm_i;
`OPC_AUIPC: alu_op1 <= d_imm_i;
`OPC_JAL: alu_op1 <= 4;
`OPC_JALR: alu_op1 <= 4;
default: alu_op1 <= rs1;
endcase // case (d_opcode_i)
case (d_opcode_i)
`OPC_LUI: alu_op2 <= 0;
`OPC_AUIPC: alu_op2 <= d_pc_i;
`OPC_JAL: alu_op2 <= d_pc_i;
`OPC_JALR: alu_op2 <= d_pc_i;
`OPC_OP_IMM: alu_op2 <= d_imm_i;
default: alu_op2 <= rs2;
endcase // case (d_opcode_i)
alu_op1 <= d_use_op1_i ? d_alu_op1_i : rs1;
alu_op2 <= d_use_op2_i ? d_alu_op2_i : rs2;
end
......@@ -274,13 +265,16 @@ module rv_exec
case (d_fun_i)
`FUNC_ADD:
alu_result <= alu_addsub_result[31:0];
`FUNC_XOR: alu_result <= alu_op1 ^ alu_op2;
`FUNC_OR: alu_result <= alu_op1 | alu_op2;
`FUNC_AND: alu_result <= alu_op1 & alu_op2;
`FUNC_SLT: alu_result <= alu_addsub_result[32]?1:0;
`FUNC_SLTU: alu_result <= alu_addsub_result[32]?1:0;
`FUNC_XOR:
alu_result <= alu_op1 ^ alu_op2;
`FUNC_OR:
alu_result <= alu_op1 | alu_op2;
`FUNC_AND:
alu_result <= alu_op1 & alu_op2;
`FUNC_SLT:
alu_result <= alu_addsub_result[32]?1:0;
`FUNC_SLTU:
alu_result <= alu_addsub_result[32]?1:0;
default: alu_result <= 32'hx;
endcase // case (d_fun_i)
end // always@ *
......@@ -409,8 +403,6 @@ module rv_exec
end
endcase // case (d_fun_i)
end
//branch decision
always@*
......@@ -433,63 +425,36 @@ module rv_exec
assign dm_data_s_o = dm_data_s;
assign dm_data_select_o = dm_select_s;
/* -----\/----- EXCLUDED -----\/-----
wire is_load = (d_opcode_i == `OPC_LOAD ? 1: 0) && d_valid_i && !x_kill_i;
wire is_store = (d_opcode_i == `OPC_STORE ? 1: 0) && d_valid_i && !x_kill_i;
-----/\----- EXCLUDED -----/\----- */
assign dm_load_o = d_is_load_i & d_valid_i & !x_kill_i & !x_stall_i & !exception;
assign dm_store_o = d_is_store_i & d_valid_i & !x_kill_i & !x_stall_i & !exception;
/* -----\/----- EXCLUDED -----\/-----
wire trig_ent = (d_pc_i == 'h264 && !x_kill_i);
wire trig_ret = (d_pc_i == 'h2bc && !x_kill_i);
wire trig_wr = (dm_addr == 'hf368 && is_store && !x_stall_i);
-----/\----- EXCLUDED -----/\----- */
always@(posedge clk_i)
if (rst_i) begin
f_branch_target_o <= 0;
// f_branch_target_o <= 0;
f_branch_take <= 0;
w_rd_write_o <= 0;
// w_rd_write_o <= 0;
// w_rd_o <= 0;
w_fun_o <= 0;
// w_fun_o <= 0;
w_load_o <= 0;
w_store_o <= 0;
w_dm_addr_o <= 0;
w_rd_source_o <= 0;
// w_dm_addr_o <= 0;
// w_rd_source_o <= 0;
w_valid_o <= 0;
end else if (!x_stall_i) begin
f_branch_target_o <= branch_target;
f_branch_take <= branch_take && !x_kill_i && d_valid_i;
w_rd_o <= d_rd_i;
w_rd_value_o <= rd_value;
w_rd_write_o <= d_rd_write_i && !x_kill_i && !exception;
w_rd_write_o <= d_rd_write_i && !x_kill_i && d_valid_i && !exception;
w_rd_source_o <= d_rd_source_i;
w_fun_o <= d_fun_i;
w_load_o <= d_is_load_i & d_valid_i && !x_kill_i && !exception;
w_store_o <= d_is_store_i & d_valid_i && !x_kill_i && !exception;
/* -----\/----- EXCLUDED -----\/-----
if ( (d_is_load_i || is_store) && !exception && unaligned_addr)
begin
$error("Unaligned address!");
$stop;
end
-----/\----- EXCLUDED -----/\----- */
w_load_o <= d_is_load_i && !x_kill_i && d_valid_i && !exception;
w_store_o <= d_is_store_i && !x_kill_i && d_valid_i && !exception;
w_dm_addr_o <= dm_addr;
w_valid_o <= d_valid_i && !x_kill_i && !exception;
end else begin // if (!x_stall_i)
w_valid_o <= 0;
w_valid_o <= !exception;
end // else: !if(rst_i)
assign f_branch_take_o = f_branch_take;
......
......@@ -2,6 +2,71 @@
`timescale 1ns/1ps
// rv_mult18x18
//
// Thin wrapper around one DSP48A1 primitive used as an 18x18 multiplier.
// x_i drives the A port, y_i drives the B port and the 36-bit M output is
// returned on q_o. MREG is the only register attribute set to 1, and its
// clock enable is the inverse of stall_i, so the product register holds its
// value while stall_i is high (presumably one clock of latency - confirm
// against the DSP48A1 user guide). rst_i drives RSTA, RSTB and RSTM.
module rv_mult18x18
  (
   input         clk_i,
   input         rst_i,

   input         stall_i,

   input [17:0]  x_i,
   input [17:0]  y_i,

   output [35:0] q_o
   );

   DSP48A1
     #(
       // register attributes: only MREG is enabled
       .A0REG       (0),
       .A1REG       (0),
       .B0REG       (0),
       .B1REG       (0),
       .CREG        (0),
       .DREG        (0),
       .MREG        (1),
       .PREG        (0),
       .OPMODEREG   (0),
       .CARRYINREG  (0),
       .CARRYOUTREG (0),
       // remaining configuration
       .CARRYINSEL  ("OPMODE5"),
       .RSTTYPE     ("SYNC")
       )
   D1
     (
      // clock and operation mode
      .CLK        (clk_i),
      .OPMODE     (8'd1),

      // data inputs
      .A          (x_i),
      .B          (y_i),
      .C          (48'h0),
      .D          (18'b0),
      .CARRYIN    (),
      .PCIN       (),

      // data outputs: only M is used
      .M          (q_o),
      .P          (),
      .BCOUT      (),
      .PCOUT      (),
      .CARRYOUT   (),
      .CARRYOUTF  (),

      // clock enables
      .CEA        (1'b0),
      .CEB        (1'b0),
      .CEC        (1'b0),
      .CED        (1'b0),
      .CEM        (~stall_i),
      .CEP        (1'b1),
      .CEOPMODE   (1'b0),
      .CECARRYIN  (1'b0),

      // resets
      .RSTA       (rst_i),
      .RSTB       (rst_i),
      .RSTC       (1'b0),
      .RSTD       (1'b0),
      .RSTM       (rst_i),
      .RSTP       (1'b0),
      .RSTOPMODE  (1'b0),
      .RSTCARRYIN (1'b0)
      );

endmodule // rv_mult18x18
module rv_multiply
(
input clk_i,
......@@ -15,14 +80,55 @@ module rv_multiply
output reg [31:0] w_rd_o
);
reg [31:0] yl_xl, yl_xh, yh_xl;
wire[17:0] xl = d_rs1_i[17:0];
wire[13:0] xh = d_rs1_i[31:18];
wire[17:0] yl = d_rs2_i[17:0];
wire[13:0] yh = d_rs2_i[31:18];
wire[17:0] xl_u = {1'b0, d_rs1_i[16:0] };
wire[17:0] yl_u = {1'b0, d_rs2_i[16:0] };
wire[17:0] xl_s = {d_rs1_i[16], d_rs1_i[16:0] };
wire[17:0] yl_s = {d_rs2_i[16], d_rs2_i[16:0] };
wire[17:0] xh = { {3{d_rs1_i[31]}}, d_rs1_i[31:17] };
wire[17:0] yh = { {3{d_rs2_i[31]}}, d_rs2_i[31:17] };
wire [35:0] yl_xl, yl_xh, yh_xl;
rv_mult18x18 U_mul0
(
.clk_i(clk_i),
.rst_i(rst_i),
.stall_i(x_stall_i),
.x_i(xl_u),
.y_i(yl_u),
.q_o(yl_xl)
);
rv_mult18x18 U_mul1
(
.clk_i(clk_i),
.rst_i(rst_i),
.stall_i(x_stall_i),
.x_i(xl_s),
.y_i(yh),
.q_o(yh_xl)
);
rv_mult18x18 U_mul2
(
.clk_i(clk_i),
.rst_i(rst_i),
.stall_i(x_stall_i),
.x_i(yl_s),
.y_i(xh),
.q_o(yl_xh)
);
/* -----\/----- EXCLUDED -----\/-----
always@(posedge clk_i)
......@@ -36,8 +142,9 @@ module rv_multiply
// stage0 <= $signed(d_rs1_i) * $signed(d_rs2_i);
*/
always@*
w_rd_o <= yl_xl + {yl_xh[13:0], 18'h0} + {yh_xl[13:0], 18'h0};
w_rd_o <= yl_xl + {yl_xh[14:0], 17'h0} + {yh_xl[14:0], 17'h0};
......
......@@ -42,9 +42,14 @@ module rv_decode
output x_valid_o,
output reg [31:0] x_pc_o,
input d_x_rs1_bypass_i,
input d_x_rs2_bypass_i,
input d_w_rs1_bypass_i,
input d_w_rs2_bypass_i,