[PATCH v3 08/12] riscv: p8700: Add software emulation for AMO* instructions

Uros Stajic uros.stajic at htecgroup.com
Tue Jul 29 18:24:17 CEST 2025


From: Chao-ying Fu <cfu at mips.com>

This patch adds software emulation for atomic memory operation (AMO)
instructions that may not be supported in hardware.

The `emu-amo.S` file provides assembly implementations of these
operations. Corresponding handler logic is integrated into the
illegal instruction trap handler to catch and emulate unsupported
AMO* instructions at runtime.

Signed-off-by: Chao-ying Fu <cfu at mips.com>
Signed-off-by: Uros Stajic <uros.stajic at htecgroup.com>
---
 arch/riscv/cpu/p8700/Makefile  |   1 +
 arch/riscv/cpu/p8700/emu-amo.S | 254 ++++++++++++++++++++++++++++
 arch/riscv/lib/interrupts.c    | 299 +++++++++++++++++++++++++++++++++
 include/interrupt.h            |  19 +++
 4 files changed, 573 insertions(+)
 create mode 100644 arch/riscv/cpu/p8700/emu-amo.S

diff --git a/arch/riscv/cpu/p8700/Makefile b/arch/riscv/cpu/p8700/Makefile
index 4dfbddc5cba..22f96401640 100644
--- a/arch/riscv/cpu/p8700/Makefile
+++ b/arch/riscv/cpu/p8700/Makefile
@@ -5,5 +5,6 @@
 obj-y += cache.o
 obj-y += cpu.o
 obj-y += dram.o
+obj-y += emu-amo.o
 
 obj-$(CONFIG_P8700_RISCV) += p8700_platform_setup.o
diff --git a/arch/riscv/cpu/p8700/emu-amo.S b/arch/riscv/cpu/p8700/emu-amo.S
new file mode 100644
index 00000000000..b7005339939
--- /dev/null
+++ b/arch/riscv/cpu/p8700/emu-amo.S
@@ -0,0 +1,254 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * Copyright (C) 2021, Chao-ying Fu <cfu at mips.com>
+ */
+
+        .text
+
+        .align 3
+	.globl	atomic_swap_w
+atomic_swap_w:			/* a0 = address, a1 = new word; old word returned in a0 */
+	lw	a5,0(a0)	/* a5 = snapshot of the word at the address */
+	mv	a4,a0		/* a4 = address; a0 is reused for the result */
+2:	mv	a0,a5		/* a0 = value the word is expected to hold */
+1:	lr.w	a5,(a4)		/* load-reserved the current word */
+	bne	a5,a0,2b	/* word changed: adopt the fresh value, retry */
+	sc.w	a6,a1,(a4)	/* try to store a1 */
+	bnez	a6,1b		/* a6 != 0: store-conditional failed, retry */
+	ret
+
+        .align 3
+	.globl	atomic_swap_d
+atomic_swap_d:			/* a0 = address, a1 = new doubleword; old value returned in a0 */
+	ld	a5,0(a0)	/* a5 = snapshot of the doubleword at the address */
+	mv	a4,a0		/* a4 = address; a0 is reused for the result */
+2:	mv	a0,a5		/* a0 = value the doubleword is expected to hold */
+1:	lr.d	a5,(a4)		/* load-reserved the current doubleword */
+	bne	a5,a0,2b	/* value changed: adopt the fresh value, retry */
+	sc.d	a6,a1,(a4)	/* try to store a1 */
+	bnez	a6,1b		/* a6 != 0: store-conditional failed, retry */
+	ret
+
+        .align 3
+	.globl	atomic_add_w
+atomic_add_w:			/* a0 = address, a1 = addend; old word returned in a0 */
+	lw	a5,0(a0)	/* a5 = snapshot of the word at the address */
+	mv	a4,a0		/* a4 = address; a0 is reused for the result */
+2:	mv	a0,a5		/* a0 = value the word is expected to hold */
+	addw	a3,a5,a1	/* a3 = 32-bit sum of the old word and a1 */
+1:	lr.w	a5,(a4)		/* load-reserved the current word */
+	bne	a5,a0,2b	/* word changed: recompute from the fresh value */
+	sc.w	a6,a3,(a4)	/* try to store the sum */
+	bnez	a6,1b		/* a6 != 0: store-conditional failed, retry */
+	ret
+
+        .align 3
+	.globl	atomic_add_d
+atomic_add_d:			/* a0 = address, a1 = addend; old doubleword returned in a0 */
+	ld	a5,0(a0)	/* a5 = snapshot of the doubleword at the address */
+	mv	a4,a0		/* a4 = address; a0 is reused for the result */
+2:	mv	a0,a5		/* a0 = value the doubleword is expected to hold */
+	add	a3,a5,a1	/* a3 = old value + a1 */
+1:	lr.d	a5,(a4)		/* load-reserved the current doubleword */
+	bne	a5,a0,2b	/* value changed: recompute from the fresh value */
+	sc.d	a6,a3,(a4)	/* try to store the sum */
+	bnez	a6,1b		/* a6 != 0: store-conditional failed, retry */
+	ret
+
+        .align 3
+	.globl	atomic_and_w
+atomic_and_w:			/* a0 = address, a1 = mask; old word returned in a0 */
+	lw	a5,0(a0)	/* a5 = snapshot of the word at the address */
+	mv	a4,a0		/* a4 = address; a0 is reused for the result */
+2:	mv	a0,a5		/* a0 = value the word is expected to hold */
+	and	a3,a5,a1	/* a3 = old word & a1 (sc.w stores the low 32 bits) */
+1:	lr.w	a5,(a4)		/* load-reserved the current word */
+	bne	a5,a0,2b	/* word changed: recompute from the fresh value */
+	sc.w	a6,a3,(a4)	/* try to store the result */
+	bnez	a6,1b		/* a6 != 0: store-conditional failed, retry */
+	ret
+
+        .align 3
+	.globl	atomic_and_d
+atomic_and_d:			/* a0 = address, a1 = mask; old doubleword returned in a0 */
+	ld	a5,0(a0)	/* a5 = snapshot of the doubleword at the address */
+	mv	a4,a0		/* a4 = address; a0 is reused for the result */
+2:	mv	a0,a5		/* a0 = value the doubleword is expected to hold */
+	and	a3,a5,a1	/* a3 = old value & a1 */
+1:	lr.d	a5,(a4)		/* load-reserved the current doubleword */
+	bne	a5,a0,2b	/* value changed: recompute from the fresh value */
+	sc.d	a6,a3,(a4)	/* try to store the result */
+	bnez	a6,1b		/* a6 != 0: store-conditional failed, retry */
+	ret
+
+        .align 3
+	.globl	atomic_or_w
+atomic_or_w:			/* a0 = address, a1 = bits to set; old word returned in a0 */
+	lw	a5,0(a0)	/* a5 = snapshot of the word at the address */
+	mv	a4,a0		/* a4 = address; a0 is reused for the result */
+2:	mv	a0,a5		/* a0 = value the word is expected to hold */
+	or	a3,a5,a1	/* a3 = old word | a1 (sc.w stores the low 32 bits) */
+1:	lr.w	a5,(a4)		/* load-reserved the current word */
+	bne	a5,a0,2b	/* word changed: recompute from the fresh value */
+	sc.w	a6,a3,(a4)	/* try to store the result */
+	bnez	a6,1b		/* a6 != 0: store-conditional failed, retry */
+	ret
+
+        .align 3
+	.globl	atomic_or_d
+atomic_or_d:			/* a0 = address, a1 = bits to set; old doubleword returned in a0 */
+	ld	a5,0(a0)	/* a5 = snapshot of the doubleword at the address */
+	mv	a4,a0		/* a4 = address; a0 is reused for the result */
+2:	mv	a0,a5		/* a0 = value the doubleword is expected to hold */
+	or	a3,a5,a1	/* a3 = old value | a1 */
+1:	lr.d	a5,(a4)		/* load-reserved the current doubleword */
+	bne	a5,a0,2b	/* value changed: recompute from the fresh value */
+	sc.d	a6,a3,(a4)	/* try to store the result */
+	bnez	a6,1b		/* a6 != 0: store-conditional failed, retry */
+	ret
+
+        .align 3
+	.globl	atomic_xor_w
+atomic_xor_w:			/* a0 = address, a1 = bits to toggle; old word returned in a0 */
+	lw	a5,0(a0)	/* a5 = snapshot of the word at the address */
+	mv	a4,a0		/* a4 = address; a0 is reused for the result */
+2:	mv	a0,a5		/* a0 = value the word is expected to hold */
+	xor	a3,a5,a1	/* a3 = old word ^ a1 (sc.w stores the low 32 bits) */
+1:	lr.w	a5,(a4)		/* load-reserved the current word */
+	bne	a5,a0,2b	/* word changed: recompute from the fresh value */
+	sc.w	a6,a3,(a4)	/* try to store the result */
+	bnez	a6,1b		/* a6 != 0: store-conditional failed, retry */
+	ret
+
+        .align 3
+	.globl	atomic_xor_d
+atomic_xor_d:			/* a0 = address, a1 = bits to toggle; old doubleword returned in a0 */
+	ld	a5,0(a0)	/* a5 = snapshot of the doubleword at the address */
+	mv	a4,a0		/* a4 = address; a0 is reused for the result */
+2:	mv	a0,a5		/* a0 = value the doubleword is expected to hold */
+	xor	a3,a5,a1	/* a3 = old value ^ a1 */
+1:	lr.d	a5,(a4)		/* load-reserved the current doubleword */
+	bne	a5,a0,2b	/* value changed: recompute from the fresh value */
+	sc.d	a6,a3,(a4)	/* try to store the result */
+	bnez	a6,1b		/* a6 != 0: store-conditional failed, retry */
+	ret
+
+        .align 3
+	.globl	atomic_max_w
+atomic_max_w:			/* a0 = address, a1 = operand; old word returned in a0 */
+	lw	a5,0(a0)	/* a5 = snapshot of the word, sign-extended */
+	mv	a4,a0		/* a4 = address; a0 is reused for the result */
+2:	mv	a0,a5		/* a0 = value the word is expected to hold */
+	sext.w	a3,a1		/* use only a1[31:0]; upper bits must not affect the compare */
+	bge	a3,a5,1f	/* operand >= old (signed 32-bit): store the operand */
+	mv	a3,a5		/* otherwise the old word is the maximum */
+1:	lr.w	a5,(a4)		/* load-reserved the current word */
+	bne	a5,a0,2b	/* word changed: recompute from the fresh value */
+	sc.w	a6,a3,(a4)	/* try to store the maximum */
+	bnez	a6,1b		/* a6 != 0: store-conditional failed, retry */
+	ret
+
+        .align 3
+	.globl	atomic_max_d
+atomic_max_d:			/* a0 = address, a1 = operand; old doubleword returned in a0 */
+	ld	a5,0(a0)	/* a5 = snapshot of the doubleword at the address */
+	mv	a4,a0		/* a4 = address; a0 is reused for the result */
+2:	mv	a0,a5		/* a0 = value the doubleword is expected to hold */
+	mv	a3,a5		/* a3 = old value, kept when old >= a1 (signed) */
+	bge	a5,a1,1f
+	mv	a3,a1		/* otherwise a1 is the maximum */
+1:	lr.d	a5,(a4)		/* load-reserved the current doubleword */
+	bne	a5,a0,2b	/* value changed: recompute from the fresh value */
+	sc.d	a6,a3,(a4)	/* try to store the maximum */
+	bnez	a6,1b		/* a6 != 0: store-conditional failed, retry */
+	ret
+
+        .align 3
+	.globl	atomic_maxu_w
+atomic_maxu_w:			/* a0 = address, a1 = operand; old word returned in a0 */
+	lw	a5,0(a0)	/* a5 = snapshot of the word, sign-extended */
+	mv	a4,a0		/* a4 = address; a0 is reused for the result */
+2:	mv	a0,a5		/* a0 = value the word is expected to hold */
+	sext.w	a3,a1		/* extend a1[31:0] like a5 so bgeu orders them as u32 */
+	bgeu	a3,a5,1f	/* operand >= old (unsigned 32-bit): store the operand */
+	mv	a3,a5		/* otherwise the old word is the maximum */
+1:	lr.w	a5,(a4)		/* load-reserved the current word */
+	bne	a5,a0,2b	/* word changed: recompute from the fresh value */
+	sc.w	a6,a3,(a4)	/* try to store the maximum */
+	bnez	a6,1b		/* a6 != 0: store-conditional failed, retry */
+	ret
+
+        .align 3
+	.globl	atomic_maxu_d
+atomic_maxu_d:			/* a0 = address, a1 = operand; old doubleword returned in a0 */
+	ld	a5,0(a0)	/* a5 = snapshot of the doubleword at the address */
+	mv	a4,a0		/* a4 = address; a0 is reused for the result */
+2:	mv	a0,a5		/* a0 = value the doubleword is expected to hold */
+	mv	a3,a5		/* a3 = old value, kept when old >= a1 (unsigned) */
+	bgeu	a5,a1,1f
+	mv	a3,a1		/* otherwise a1 is the maximum */
+1:	lr.d	a5,(a4)		/* load-reserved the current doubleword */
+	bne	a5,a0,2b	/* value changed: recompute from the fresh value */
+	sc.d	a6,a3,(a4)	/* try to store the maximum */
+	bnez	a6,1b		/* a6 != 0: store-conditional failed, retry */
+	ret
+
+        .align 3
+	.globl	atomic_min_w
+atomic_min_w:			/* a0 = address, a1 = operand; old word returned in a0 */
+	lw	a5,0(a0)	/* a5 = snapshot of the word, sign-extended */
+	mv	a4,a0		/* a4 = address; a0 is reused for the result */
+2:	mv	a0,a5		/* a0 = value the word is expected to hold */
+	sext.w	a3,a1		/* use only a1[31:0]; upper bits must not affect the compare */
+	bge	a5,a3,1f	/* old >= operand (signed 32-bit): store the operand */
+	mv	a3,a5		/* otherwise the old word is the minimum */
+1:	lr.w	a5,(a4)		/* load-reserved the current word */
+	bne	a5,a0,2b	/* word changed: recompute from the fresh value */
+	sc.w	a6,a3,(a4)	/* try to store the minimum */
+	bnez	a6,1b		/* a6 != 0: store-conditional failed, retry */
+	ret
+
+        .align 3
+	.globl	atomic_min_d
+atomic_min_d:			/* a0 = address, a1 = operand; old doubleword returned in a0 */
+	ld	a5,0(a0)	/* a5 = snapshot of the doubleword at the address */
+	mv	a4,a0		/* a4 = address; a0 is reused for the result */
+2:	mv	a0,a5		/* a0 = value the doubleword is expected to hold */
+	mv	a3,a5		/* a3 = old value, kept when a1 >= old (signed) */
+	bge	a1,a5,1f
+	mv	a3,a1		/* otherwise a1 is the minimum */
+1:	lr.d	a5,(a4)		/* load-reserved the current doubleword */
+	bne	a5,a0,2b	/* value changed: recompute from the fresh value */
+	sc.d	a6,a3,(a4)	/* try to store the minimum */
+	bnez	a6,1b		/* a6 != 0: store-conditional failed, retry */
+	ret
+
+        .align 3
+	.globl	atomic_minu_w
+atomic_minu_w:			/* a0 = address, a1 = operand; old word returned in a0 */
+	lw	a5,0(a0)	/* a5 = snapshot of the word, sign-extended */
+	mv	a4,a0		/* a4 = address; a0 is reused for the result */
+2:	mv	a0,a5		/* a0 = value the word is expected to hold */
+	sext.w	a3,a1		/* extend a1[31:0] like a5 so bgeu orders them as u32 */
+	bgeu	a5,a3,1f	/* old >= operand (unsigned 32-bit): store the operand */
+	mv	a3,a5		/* otherwise the old word is the minimum */
+1:	lr.w	a5,(a4)		/* load-reserved the current word */
+	bne	a5,a0,2b	/* word changed: recompute from the fresh value */
+	sc.w	a6,a3,(a4)	/* try to store the minimum */
+	bnez	a6,1b		/* a6 != 0: store-conditional failed, retry */
+	ret
+
+        .align 3
+	.globl	atomic_minu_d
+atomic_minu_d:			/* a0 = address, a1 = operand; old doubleword returned in a0 */
+	ld	a5,0(a0)	/* a5 = snapshot of the doubleword at the address */
+	mv	a4,a0		/* a4 = address; a0 is reused for the result */
+2:	mv	a0,a5		/* a0 = value the doubleword is expected to hold */
+	mv	a3,a5		/* a3 = old value, kept when a1 >= old (unsigned) */
+	bgeu	a1,a5,1f
+	mv	a3,a1		/* otherwise a1 is the minimum */
+1:	lr.d	a5,(a4)		/* load-reserved the current doubleword */
+	bne	a5,a0,2b	/* value changed: recompute from the fresh value */
+	sc.d	a6,a3,(a4)	/* try to store the minimum */
+	bnez	a6,1b		/* a6 != 0: store-conditional failed, retry */
+	ret
diff --git a/arch/riscv/lib/interrupts.c b/arch/riscv/lib/interrupts.c
index ef1056eeb6f..906916c762f 100644
--- a/arch/riscv/lib/interrupts.c
+++ b/arch/riscv/lib/interrupts.c
@@ -22,6 +22,36 @@
 
 DECLARE_GLOBAL_DATA_PTR;
 
+#define ILLEGAL_INSTRUCTION 2	/* exception code checked in _exit_trap() */
+#define AMO_MASK 0xf800707f	/* bits 31:27, 14:12 and 6:0 identify the AMO */
+#define AQRL_MASK 0x06000000	/* bits 26:25; not covered by AMO_MASK */
+#define AQRL_SHIFT 25
+#define RS2_MASK 0x01f00000	/* bits 24:20 */
+#define RS2_SHIFT 20
+#define RS1_MASK 0x000f8000	/* bits 19:15 */
+#define RS1_SHIFT 15
+#define RD_MASK 0x00000f80	/* bits 11:7 */
+#define RD_SHIFT 7
+
+#define AMOADD_D_MATCH 0x0000302f	/* *_MATCH: compared with (opcode & AMO_MASK) */
+#define AMOADD_W_MATCH 0x0000202f
+#define AMOAND_D_MATCH 0x6000302f
+#define AMOAND_W_MATCH 0x6000202f
+#define AMOMAX_D_MATCH 0xa000302f
+#define AMOMAX_W_MATCH 0xa000202f
+#define AMOMAXU_D_MATCH 0xe000302f
+#define AMOMAXU_W_MATCH 0xe000202f
+#define AMOMIN_D_MATCH 0x8000302f
+#define AMOMIN_W_MATCH 0x8000202f
+#define AMOMINU_D_MATCH 0xc000302f
+#define AMOMINU_W_MATCH 0xc000202f
+#define AMOOR_D_MATCH 0x4000302f
+#define AMOOR_W_MATCH 0x4000202f
+#define AMOSWAP_D_MATCH 0x0800302f
+#define AMOSWAP_W_MATCH 0x0800202f
+#define AMOXOR_D_MATCH 0x2000302f
+#define AMOXOR_W_MATCH 0x2000202f
+
 void set_resume(struct resume_data *data)
 {
 	gd->arch.resume = data;
@@ -115,6 +145,184 @@ static void show_code(ulong epc)
 		printf("%04x%s", pos[i], i + 1 == len ? ")\n" : " ");
 }
 
+static ulong get_reg(struct pt_regs *regs, int reg_num)
+{
+	switch (reg_num) {	/* map register number 0..31 to its saved value in *regs */
+	case 0:			/* register 0 always reads as zero */
+		return 0;
+	case 1:
+		return regs->ra;
+	case 2:
+		return regs->sp;
+	case 3:
+		return regs->gp;
+	case 4:
+		return regs->tp;
+	case 5:
+		return regs->t0;
+	case 6:
+		return regs->t1;
+	case 7:
+		return regs->t2;
+	case 8:
+		return regs->s0;
+	case 9:
+		return regs->s1;
+	case 10:
+		return regs->a0;
+	case 11:
+		return regs->a1;
+	case 12:
+		return regs->a2;
+	case 13:
+		return regs->a3;
+	case 14:
+		return regs->a4;
+	case 15:
+		return regs->a5;
+	case 16:
+		return regs->a6;
+	case 17:
+		return regs->a7;
+	case 18:
+		return regs->s2;
+	case 19:
+		return regs->s3;
+	case 20:
+		return regs->s4;
+	case 21:
+		return regs->s5;
+	case 22:
+		return regs->s6;
+	case 23:
+		return regs->s7;
+	case 24:
+		return regs->s8;
+	case 25:
+		return regs->s9;
+	case 26:
+		return regs->s10;
+	case 27:
+		return regs->s11;
+	case 28:
+		return regs->t3;
+	case 29:
+		return regs->t4;
+	case 30:
+		return regs->t5;
+	case 31:
+		return regs->t6;
+	default:		/* reg_num outside 0..31: report it */
+		printf("Error reg_num=%d for %s\n", reg_num, __func__);
+		break;
+	}
+	return 0;		/* only reached for an invalid reg_num */
+}
+
+static void set_reg(struct pt_regs *regs, int reg_num, ulong reg_value)
+{
+	switch (reg_num) {	/* store reg_value into the saved slot of register 0..31 */
+	case 0:			/* writes to register 0 are discarded */
+		break;
+	case 1:
+		regs->ra = reg_value;
+		break;
+	case 2:
+		regs->sp = reg_value;
+		break;
+	case 3:
+		regs->gp = reg_value;
+		break;
+	case 4:
+		regs->tp = reg_value;
+		break;
+	case 5:
+		regs->t0 = reg_value;
+		break;
+	case 6:
+		regs->t1 = reg_value;
+		break;
+	case 7:
+		regs->t2 = reg_value;
+		break;
+	case 8:
+		regs->s0 = reg_value;
+		break;
+	case 9:
+		regs->s1 = reg_value;
+		break;
+	case 10:
+		regs->a0 = reg_value;
+		break;
+	case 11:
+		regs->a1 = reg_value;
+		break;
+	case 12:
+		regs->a2 = reg_value;
+		break;
+	case 13:
+		regs->a3 = reg_value;
+		break;
+	case 14:
+		regs->a4 = reg_value;
+		break;
+	case 15:
+		regs->a5 = reg_value;
+		break;
+	case 16:
+		regs->a6 = reg_value;
+		break;
+	case 17:
+		regs->a7 = reg_value;
+		break;
+	case 18:
+		regs->s2 = reg_value;
+		break;
+	case 19:
+		regs->s3 = reg_value;
+		break;
+	case 20:
+		regs->s4 = reg_value;
+		break;
+	case 21:
+		regs->s5 = reg_value;
+		break;
+	case 22:
+		regs->s6 = reg_value;
+		break;
+	case 23:
+		regs->s7 = reg_value;
+		break;
+	case 24:
+		regs->s8 = reg_value;
+		break;
+	case 25:
+		regs->s9 = reg_value;
+		break;
+	case 26:
+		regs->s10 = reg_value;
+		break;
+	case 27:
+		regs->s11 = reg_value;
+		break;
+	case 28:
+		regs->t3 = reg_value;
+		break;
+	case 29:
+		regs->t4 = reg_value;
+		break;
+	case 30:
+		regs->t5 = reg_value;
+		break;
+	case 31:
+		regs->t6 = reg_value;
+		break;
+	default:		/* reg_num outside 0..31: report it, store nothing */
+		printf("Error reg_num=%d for %s\n", reg_num, __func__);
+		break;
+	}
+}
+
 static void _exit_trap(ulong code, ulong epc, ulong tval, struct pt_regs *regs)
 {
 	static const char * const exception_code[] = {
@@ -140,6 +348,97 @@ static void _exit_trap(ulong code, ulong epc, ulong tval, struct pt_regs *regs)
 		gd->arch.resume->code = code;
 		longjmp(gd->arch.resume->jump, 1);
 	}
+	if (IS_ENABLED(CONFIG_P8700_RISCV) && code == ILLEGAL_INSTRUCTION) {
+		/*
+		 * Software emulation of AMO* instructions that trapped as
+		 * illegal: decode the faulting instruction and, if it is one
+		 * of the encodings below, perform it with the matching
+		 * LR/SC-based helper from emu-amo.S and write the old memory
+		 * value the helper returns to rd.
+		 *
+		 * Fetch one 16-bit op at a time to deal with 16-bit alignment.
+		 * FIXME! For the big-endian mode, we need to swap bytes.
+		 */
+		unsigned short op0 = *(unsigned short *)epc;
+		unsigned short op1 = *((unsigned short *)epc + 1);
+		/* Widen first: a promoted int shifted into bit 31 is UB. */
+		unsigned int opcode = ((unsigned int)op1 << 16) | op0;
+		int rs2 = (opcode & RS2_MASK) >> RS2_SHIFT;
+		int rs1 = (opcode & RS1_MASK) >> RS1_SHIFT;
+		int rd = (opcode & RD_MASK) >> RD_SHIFT;
+		/* rs1 supplies the memory address, rs2 the operand. */
+		ulong rs2_value = get_reg(regs, rs2);
+		ulong rs1_value = get_reg(regs, rs1);
+
+		/*
+		 * AMO_MASK leaves out the aq/rl bits, so they are ignored.
+		 * Every case ends in break; without it one AMO would fall
+		 * through and also run all the operations listed after it.
+		 *
+		 * NOTE(review): epc is not advanced in this block and control
+		 * continues to the unhandled-exception report below; confirm
+		 * how execution should resume after a successful emulation.
+		 */
+		switch (opcode & AMO_MASK) {
+		case AMOADD_D_MATCH:
+			set_reg(regs, rd, atomic_add_d(rs1_value, rs2_value));
+			break;
+		case AMOADD_W_MATCH:
+			set_reg(regs, rd, atomic_add_w(rs1_value, rs2_value));
+			break;
+		case AMOAND_D_MATCH:
+			set_reg(regs, rd, atomic_and_d(rs1_value, rs2_value));
+			break;
+		case AMOAND_W_MATCH:
+			set_reg(regs, rd, atomic_and_w(rs1_value, rs2_value));
+			break;
+		case AMOMAX_D_MATCH:
+			set_reg(regs, rd, atomic_max_d(rs1_value, rs2_value));
+			break;
+		case AMOMAX_W_MATCH:
+			set_reg(regs, rd, atomic_max_w(rs1_value, rs2_value));
+			break;
+		case AMOMAXU_D_MATCH:
+			set_reg(regs, rd, atomic_maxu_d(rs1_value, rs2_value));
+			break;
+		case AMOMAXU_W_MATCH:
+			set_reg(regs, rd, atomic_maxu_w(rs1_value, rs2_value));
+			break;
+		case AMOMIN_D_MATCH:
+			set_reg(regs, rd, atomic_min_d(rs1_value, rs2_value));
+			break;
+		case AMOMIN_W_MATCH:
+			set_reg(regs, rd, atomic_min_w(rs1_value, rs2_value));
+			break;
+		case AMOMINU_D_MATCH:
+			set_reg(regs, rd, atomic_minu_d(rs1_value, rs2_value));
+			break;
+		case AMOMINU_W_MATCH:
+			set_reg(regs, rd, atomic_minu_w(rs1_value, rs2_value));
+			break;
+		case AMOOR_D_MATCH:
+			set_reg(regs, rd, atomic_or_d(rs1_value, rs2_value));
+			break;
+		case AMOOR_W_MATCH:
+			set_reg(regs, rd, atomic_or_w(rs1_value, rs2_value));
+			break;
+		case AMOSWAP_D_MATCH:
+			set_reg(regs, rd, atomic_swap_d(rs1_value, rs2_value));
+			break;
+		case AMOSWAP_W_MATCH:
+			set_reg(regs, rd, atomic_swap_w(rs1_value, rs2_value));
+			break;
+		case AMOXOR_D_MATCH:
+			set_reg(regs, rd, atomic_xor_d(rs1_value, rs2_value));
+			break;
+		case AMOXOR_W_MATCH:
+			set_reg(regs, rd, atomic_xor_w(rs1_value, rs2_value));
+			break;
+		default:
+			/* Not an emulated AMO: leave it to the report below. */
+			break;
+		}
+	}
 
 	if (code < ARRAY_SIZE(exception_code))
 		printf("Unhandled exception: %s\n", exception_code[code]);
diff --git a/include/interrupt.h b/include/interrupt.h
index 6ea28b54a56..5fc983afccb 100644
--- a/include/interrupt.h
+++ b/include/interrupt.h
@@ -43,3 +43,22 @@ struct resume_data {
  * Return:	0 before an exception, 1 after an exception occurred
  */
 void set_resume(struct resume_data *data);
+/* AMO emulation: atomically combine val into *addr, return the old *addr */
+ulong atomic_swap_w(ulong addr, ulong val);
+ulong atomic_swap_d(ulong addr, ulong val);
+ulong atomic_add_w(ulong addr, ulong val);
+ulong atomic_add_d(ulong addr, ulong val);
+ulong atomic_and_w(ulong addr, ulong val);
+ulong atomic_and_d(ulong addr, ulong val);
+ulong atomic_or_w(ulong addr, ulong val);
+ulong atomic_or_d(ulong addr, ulong val);
+ulong atomic_xor_w(ulong addr, ulong val);
+ulong atomic_xor_d(ulong addr, ulong val);
+ulong atomic_max_w(ulong addr, ulong val);
+ulong atomic_max_d(ulong addr, ulong val);
+ulong atomic_maxu_w(ulong addr, ulong val);
+ulong atomic_maxu_d(ulong addr, ulong val);
+ulong atomic_min_w(ulong addr, ulong val);
+ulong atomic_min_d(ulong addr, ulong val);
+ulong atomic_minu_w(ulong addr, ulong val);
+ulong atomic_minu_d(ulong addr, ulong val);
-- 
2.34.1


More information about the U-Boot mailing list