Merge branch 'main' of github.com:davidharrishmc/riscv-wally into main

This commit is contained in:
Ross Thompson 2022-01-12 13:29:19 -06:00
commit f18684efbf
30 changed files with 15588 additions and 176 deletions

3
.gitmodules vendored
View File

@ -8,3 +8,6 @@
[submodule "addins/imperas-riscv-tests"]
path = addins/imperas-riscv-tests
url = https://github.com/riscv-ovpsim/imperas-riscv-tests
[submodule "addins/riscv-tests"]
path = addins/riscv-tests
url = https://github.com/riscv-software-src/riscv-tests

1
addins/riscv-tests Submodule

@ -0,0 +1 @@
Subproject commit cf04274f50621fd9ef9147793cca6dd1657985c7

24
examples/C/common/LICENSE Normal file
View File

@ -0,0 +1,24 @@
Copyright (c) 2012-2015, The Regents of the University of California (Regents).
All Rights Reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
3. Neither the name of the Regents nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
IN NO EVENT SHALL REGENTS BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT,
SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING LOST PROFITS, ARISING
OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF REGENTS HAS
BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
REGENTS SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE. THE SOFTWARE AND ACCOMPANYING DOCUMENTATION, IF ANY, PROVIDED
HEREUNDER IS PROVIDED "AS IS". REGENTS HAS NO OBLIGATION TO PROVIDE
MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.

1
examples/C/common/README Normal file
View File

@ -0,0 +1 @@
These files are from github.com/riscv-software-src/riscv-tests

225
examples/C/common/crt.S Normal file
View File

@ -0,0 +1,225 @@
# See LICENSE for license details.
#include "encoding.h"
#if __riscv_xlen == 64
# define LREG ld
# define SREG sd
# define REGBYTES 8
#else
# define LREG lw
# define SREG sw
# define REGBYTES 4
#endif
# Reset entry point. It lives in .text.init, which test.ld places first at
# the start of the image. The code below clears the register file, checks
# XLEN, sets up the FPU, trap vector, gp, tp and sp, then jumps to _init.
.section ".text.init"
.globl _start
_start:
# Clear x1-x31 so that no integer register starts with an undefined value.
li x1, 0
li x2, 0
li x3, 0
li x4, 0
li x5, 0
li x6, 0
li x7, 0
li x8, 0
li x9, 0
li x10,0
li x11,0
li x12,0
li x13,0
li x14,0
li x15,0
li x16,0
li x17,0
li x18,0
li x19,0
li x20,0
li x21,0
li x22,0
li x23,0
li x24,0
li x25,0
li x26,0
li x27,0
li x28,0
li x29,0
li x30,0
li x31,0
# enable FPU and accelerator if present
li t0, MSTATUS_FS | MSTATUS_XS
csrs mstatus, t0
# make sure XLEN agrees with compilation choice
# t0 = 1 << 31: negative when registers are 32 bits wide, positive when
# they are 64 bits wide. The branch that matches the build skips to 1.
li t0, 1
slli t0, t0, 31
#if __riscv_xlen == 64
bgez t0, 1f
#else
bltz t0, 1f
#endif
# Reached only on an XLEN mismatch: store 1 to tohost forever
# (t0 is the scratch register of the symbol store).
2:
li a0, 1
sw a0, tohost, t0
j 2b
1:
#ifdef __riscv_flen
# initialize FPU if we have one
# mtvec temporarily targets the label after this block, presumably so that
# a trap raised by one of the FP instructions below skips the rest of it.
la t0, 1f
csrw mtvec, t0
# Write zero to the FP control and status register, then zero f0-f31.
fssr x0
fmv.s.x f0, x0
fmv.s.x f1, x0
fmv.s.x f2, x0
fmv.s.x f3, x0
fmv.s.x f4, x0
fmv.s.x f5, x0
fmv.s.x f6, x0
fmv.s.x f7, x0
fmv.s.x f8, x0
fmv.s.x f9, x0
fmv.s.x f10,x0
fmv.s.x f11,x0
fmv.s.x f12,x0
fmv.s.x f13,x0
fmv.s.x f14,x0
fmv.s.x f15,x0
fmv.s.x f16,x0
fmv.s.x f17,x0
fmv.s.x f18,x0
fmv.s.x f19,x0
fmv.s.x f20,x0
fmv.s.x f21,x0
fmv.s.x f22,x0
fmv.s.x f23,x0
fmv.s.x f24,x0
fmv.s.x f25,x0
fmv.s.x f26,x0
fmv.s.x f27,x0
fmv.s.x f28,x0
fmv.s.x f29,x0
fmv.s.x f30,x0
fmv.s.x f31,x0
1:
#endif
# initialize trap vector
la t0, trap_entry
csrw mtvec, t0
# initialize global pointer
# Relaxation is switched off around this load; presumably so the linker does
# not rewrite the gp load itself into a gp-relative access.
.option push
.option norelax
la gp, __global_pointer$
.option pop
# tp = first 64-byte aligned address at or after _end
la tp, _end + 63
and tp, tp, -64
# get core id
csrr a0, mhartid
# for now, assume only 1 core
# Harts whose id is >= a1 spin on this branch forever.
li a1, 1
1:bgeu a0, a1, 1b
# give each core 128KB of stack + TLS
# sp = base + ((hartid + 1) << STKSHIFT), tp = base + (hartid << STKSHIFT),
# where base is the aligned tp value computed above.
#define STKSHIFT 17
add sp, a0, 1
sll sp, sp, STKSHIFT
add sp, sp, tp
sll a2, a0, STKSHIFT
add tp, tp, a2
# a0 (hart id) and a1 (1) are still live and become the arguments of _init.
j _init
# Machine-mode trap handler installed in mtvec by _start.
# Saves x1-x31 on the interrupted stack, calls
# handle_trap(mcause, mepc, pointer to the saved registers), resumes at the
# address that handle_trap returns, and stays in M-mode.
# 4-byte alignment for the handler address.
.align 2
trap_entry:
# Reserve a 272-byte frame. Slot k holds xk; slot 0 is never written.
addi sp, sp, -272
SREG x1, 1*REGBYTES(sp)
# The x2 slot records sp after the frame was reserved, not the trapped sp.
SREG x2, 2*REGBYTES(sp)
SREG x3, 3*REGBYTES(sp)
SREG x4, 4*REGBYTES(sp)
SREG x5, 5*REGBYTES(sp)
SREG x6, 6*REGBYTES(sp)
SREG x7, 7*REGBYTES(sp)
SREG x8, 8*REGBYTES(sp)
SREG x9, 9*REGBYTES(sp)
SREG x10, 10*REGBYTES(sp)
SREG x11, 11*REGBYTES(sp)
SREG x12, 12*REGBYTES(sp)
SREG x13, 13*REGBYTES(sp)
SREG x14, 14*REGBYTES(sp)
SREG x15, 15*REGBYTES(sp)
SREG x16, 16*REGBYTES(sp)
SREG x17, 17*REGBYTES(sp)
SREG x18, 18*REGBYTES(sp)
SREG x19, 19*REGBYTES(sp)
SREG x20, 20*REGBYTES(sp)
SREG x21, 21*REGBYTES(sp)
SREG x22, 22*REGBYTES(sp)
SREG x23, 23*REGBYTES(sp)
SREG x24, 24*REGBYTES(sp)
SREG x25, 25*REGBYTES(sp)
SREG x26, 26*REGBYTES(sp)
SREG x27, 27*REGBYTES(sp)
SREG x28, 28*REGBYTES(sp)
SREG x29, 29*REGBYTES(sp)
SREG x30, 30*REGBYTES(sp)
SREG x31, 31*REGBYTES(sp)
# Arguments for handle_trap: a0 = cause, a1 = trapped pc, a2 = register frame.
csrr a0, mcause
csrr a1, mepc
mv a2, sp
jal handle_trap
# The value returned in a0 is the pc that mret will resume at.
csrw mepc, a0
# Remain in M-mode after eret
li t0, MSTATUS_MPP
csrs mstatus, t0
# Reload every register from the frame, so changes that handle_trap made
# to the saved values take effect.
LREG x1, 1*REGBYTES(sp)
LREG x2, 2*REGBYTES(sp)
LREG x3, 3*REGBYTES(sp)
LREG x4, 4*REGBYTES(sp)
LREG x5, 5*REGBYTES(sp)
LREG x6, 6*REGBYTES(sp)
LREG x7, 7*REGBYTES(sp)
LREG x8, 8*REGBYTES(sp)
LREG x9, 9*REGBYTES(sp)
LREG x10, 10*REGBYTES(sp)
LREG x11, 11*REGBYTES(sp)
LREG x12, 12*REGBYTES(sp)
LREG x13, 13*REGBYTES(sp)
LREG x14, 14*REGBYTES(sp)
LREG x15, 15*REGBYTES(sp)
LREG x16, 16*REGBYTES(sp)
LREG x17, 17*REGBYTES(sp)
LREG x18, 18*REGBYTES(sp)
LREG x19, 19*REGBYTES(sp)
LREG x20, 20*REGBYTES(sp)
LREG x21, 21*REGBYTES(sp)
LREG x22, 22*REGBYTES(sp)
LREG x23, 23*REGBYTES(sp)
LREG x24, 24*REGBYTES(sp)
LREG x25, 25*REGBYTES(sp)
LREG x26, 26*REGBYTES(sp)
LREG x27, 27*REGBYTES(sp)
LREG x28, 28*REGBYTES(sp)
LREG x29, 29*REGBYTES(sp)
LREG x30, 30*REGBYTES(sp)
LREG x31, 31*REGBYTES(sp)
# Release the frame and return from the trap.
addi sp, sp, 272
mret
# Host mailbox words, kept in a separate writable section (.tohost).
# Each one is a zero-initialized 64-bit word on its own 64-byte boundary.
# tohost is written by the program (see the XLEN check in _start),
# fromhost is declared next to it for the answer direction.
.section ".tohost","aw",@progbits
.align 6
.globl tohost
tohost: .dword 0
.align 6
.globl fromhost
fromhost: .dword 0

2832
examples/C/common/encoding.h Normal file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,469 @@
// See LICENSE for license details.
#include <stdint.h>
#include <string.h>
#include <stdarg.h>
#include <stdio.h>
#include <limits.h>
#include <sys/signal.h>
#include "util.h"
#define SYS_write 64
#undef strcmp
extern volatile uint64_t tohost;
extern volatile uint64_t fromhost;
// Sends one request to the host through the tohost/fromhost mailbox and
// returns slot 0 of the request block as it reads after the host answered.
//   which       request number, stored in slot 0 (e.g. SYS_write)
//   arg0..arg2  request arguments, stored in slots 1..3
// The function busy-waits; it does not return until fromhost is non-zero.
static uintptr_t syscall(uintptr_t which, uint64_t arg0, uint64_t arg1, uint64_t arg2)
{
// Request block: 64-byte aligned and volatile so every store is emitted.
volatile uint64_t magic_mem[8] __attribute__((aligned(64)));
magic_mem[0] = which;
magic_mem[1] = arg0;
magic_mem[2] = arg1;
magic_mem[3] = arg2;
// Full barrier: the block contents are written before its address is published.
__sync_synchronize();
// Publishing the block address in tohost is what starts the request.
tohost = (uintptr_t)magic_mem;
// Spin until fromhost becomes non-zero, then clear it for the next request.
while (fromhost == 0)
;
fromhost = 0;
__sync_synchronize();
return magic_mem[0];
}
#define NUM_COUNTERS 2
static uintptr_t counters[NUM_COUNTERS];
static char* counter_names[NUM_COUNTERS];
// Samples the mcycle and minstret CSRs into counters[].
// With enable != 0 the raw readings are stored (start of a measurement).
// With enable == 0 the stored reading is subtracted from the new one, so
// counters[] ends up holding the elapsed amount, and the CSR name is
// recorded in counter_names[] for the report printed by _init.
void setStats(int enable)
{
  int slot = 0;

// One sample per CSR. The leading empty loop hangs on purpose if more CSRs
// are sampled than NUM_COUNTERS provides room for.
#define SAMPLE_CSR(reg) do { \
    for (; slot >= NUM_COUNTERS; ) ; \
    uintptr_t reading = read_csr(reg); \
    if (!enable) { \
      reading -= counters[slot]; \
      counter_names[slot] = #reg; \
    } \
    counters[slot] = reading; \
    slot++; \
  } while (0)

  SAMPLE_CSR(mcycle);
  SAMPLE_CSR(minstret);

#undef SAMPLE_CSR
}
// Reports `code` to the host and never returns.
// The value written to tohost is the code shifted left by one with bit 0 set.
void __attribute__((noreturn)) tohost_exit(uintptr_t code)
{
  uintptr_t message = (code << 1) | 1;
  tohost = message;
  for (;;)
    ;
}
// Default (weak) trap handler called from trap_entry: any trap ends the run
// with exit code 1337. Programs that expect traps provide their own version.
uintptr_t __attribute__((weak)) handle_trap(uintptr_t cause, uintptr_t epc, uintptr_t regs[32])
{
  (void)cause;
  (void)epc;
  (void)regs;
  tohost_exit(1337);
}
// Freestanding exit(): hands the status to the host via tohost_exit.
void exit(int code)
{
  uintptr_t status = code;
  tohost_exit(status);
}
// Freestanding abort(): exits with status 128 + SIGABRT.
void abort()
{
  int status = 128 + SIGABRT;
  exit(status);
}
// Writes the NUL-terminated string s to descriptor 1 with one SYS_write request.
void printstr(const char* s)
{
  size_t length = strlen(s);
  syscall(SYS_write, 1, (uintptr_t)s, length);
}
// Default (weak) per-core entry hook.
// Multi-threaded programs replace this function. In the single-threaded
// default only core 0 returns; every other core is parked here forever.
void __attribute__((weak)) thread_entry(int cid, int nc)
{
  (void)nc;
  if (cid == 0)
    return;
  for (;;)
    ;
}
// Default (weak) main, used only when the program does not define its own:
// prints a reminder and reports failure.
int __attribute__((weak)) main(int argc, char** argv)
{
  static const char reminder[] = "Implement main(), foo!\n";
  (void)argc;
  (void)argv;
  printstr(reminder);
  return -1;
}
// Sets up the thread-local storage block that the tp register points to:
// copies the initialized image (.tdata) there and zero-fills the .tbss part
// that follows it. The three boundary symbols come from the linker script.
static void init_tls()
{
// Bound to the tp register, so reading it yields the value crt.S put in tp.
register void* thread_pointer asm("tp");
extern char _tdata_begin, _tdata_end, _tbss_end;
size_t tdata_size = &_tdata_end - &_tdata_begin;
memcpy(thread_pointer, &_tdata_begin, tdata_size);
// .tbss is measured from the end of .tdata up to _tbss_end.
size_t tbss_size = &_tbss_end - &_tdata_end;
memset(thread_pointer + tdata_size, 0, tbss_size);
}
// C entry point reached from _start in crt.S.
//   cid  id of the calling core
//   nc   number of cores
// Initializes TLS, runs thread_entry and main, prints every non-zero
// statistics counter collected by setStats, then exits with the value main
// returned. Never returns.
void _init(int cid, int nc)
{
  init_tls();
  thread_entry(cid, nc);

  // only single-threaded programs should ever get here.
  int ret = main(0, 0);

  // 32 bytes per counter are enough for the longest possible line:
  // "minstret = " plus 20 decimal digits plus the newline.
  char buf[NUM_COUNTERS * 32] __attribute__((aligned(64)));
  char* pbuf = buf;
  for (int i = 0; i < NUM_COUNTERS; i++)
    if (counters[i])
      // counters[] holds uintptr_t values. Printing them with "%d" fetched
      // an int and so truncated the count to 32 bits on RV64; fetch and
      // print the full width instead.
      pbuf += sprintf(pbuf, "%s = %lu\n", counter_names[i], (unsigned long)counters[i]);
  if (pbuf != buf)
    printstr(buf);
  exit(ret);
}
#undef putchar
// Line-buffered putchar: characters are collected in a per-thread 64-byte
// buffer and sent to descriptor 1 when a newline arrives or the buffer is
// full. Always returns 0.
int putchar(int ch)
{
  static __thread char buf[64] __attribute__((aligned(64)));
  static __thread int buflen = 0;

  buf[buflen] = ch;
  buflen += 1;

  int saw_newline = (ch == '\n');
  int buffer_full = (buflen == sizeof(buf));
  if (saw_newline || buffer_full) {
    syscall(SYS_write, 1, (uintptr_t)buf, buflen);
    buflen = 0;
  }
  return 0;
}
// Prints x as exactly 16 lower-case hexadecimal digits (no prefix, no newline).
void printhex(uint64_t x)
{
  static const char hexdigits[] = "0123456789abcdef";
  char str[17];

  str[16] = 0;
  // Fill from the least significant nibble (rightmost digit) towards the left.
  for (int pos = 15; pos >= 0; pos--) {
    str[pos] = hexdigits[x & 0xF];
    x >>= 4;
  }
  printstr(str);
}
// Emits num in the given base through putch, left-padded with padc up to
// `width` characters. Digits above 9 are written as lower-case letters.
//   putch/putdat  output callback and its context pointer
//   width         minimum field width; no padding when it does not exceed
//                 the digit count (e.g. -1)
static inline void printnum(void (*putch)(int, void**), void **putdat,
                            unsigned long long num, unsigned base, int width, int padc)
{
  unsigned digs[sizeof(num)*CHAR_BIT];
  int pos = 0;

  // Collect digits least significant first; always produces at least one.
  for (;;) {
    unsigned digit = num % base;
    digs[pos] = digit;
    pos++;
    if (num < base)
      break;
    num /= base;
  }

  // Padding goes in front of the digits.
  while (width > pos) {
    putch(padc, putdat);
    width--;
  }

  // Replay the digits most significant first.
  for (int i = pos - 1; i >= 0; i--) {
    unsigned digit = digs[i];
    putch(digit + (digit >= 10 ? 'a' - 10 : '0'), putdat);
  }
}
// Fetches the next unsigned argument from *ap.
// lflag selects the width: 0 = unsigned int, 1 = unsigned long,
// 2 or more = unsigned long long.
static unsigned long long getuint(va_list *ap, int lflag)
{
  if (lflag == 0)
    return va_arg(*ap, unsigned int);
  if (lflag < 2)
    return va_arg(*ap, unsigned long);
  return va_arg(*ap, unsigned long long);
}
// Fetches the next signed argument from *ap.
// lflag selects the width: 0 = int, 1 = long, 2 or more = long long.
static long long getint(va_list *ap, int lflag)
{
  if (lflag == 0)
    return va_arg(*ap, int);
  if (lflag < 2)
    return va_arg(*ap, long);
  return va_arg(*ap, long long);
}
// Minimal printf formatting engine.
// Walks fmt and sends every output character to putch(ch, putdat).
// Conversions handled: %c %s %d %u %o %p %x %%, with optional '-', '0',
// '#', width, '.', '*' and 'l'/'ll' modifiers. Anything else (for example
// %f) takes the default case: a '%' is emitted and the rest of the
// sequence is printed literally, without consuming an argument.
//   putch   output callback
//   putdat  context pointer handed to every putch call
//   fmt     format string
//   ap      arguments matching fmt
static void vprintfmt(void (*putch)(int, void**), void **putdat, const char *fmt, va_list ap)
{
register const char* p;
const char* last_fmt;
// NOTE(review): err is declared but never used; altflag is set by '#'
// but never read.
register int ch, err;
unsigned long long num;
int base, lflag, width, precision, altflag;
char padc;
while (1) {
// Copy plain characters until the next '%'; the terminating NUL ends the call.
while ((ch = *(unsigned char *) fmt) != '%') {
if (ch == '\0')
return;
fmt++;
putch(ch, putdat);
}
fmt++;
// Process a %-escape sequence
// last_fmt remembers where the sequence starts so the default case can rewind.
last_fmt = fmt;
padc = ' ';
width = -1;
precision = -1;
lflag = 0;
altflag = 0;
// Flags and modifiers jump back here until a conversion character ends
// the sequence with a break.
reswitch:
switch (ch = *(unsigned char *) fmt++) {
// flag to pad on the right
// NOTE(review): only the 's' case treats '-' as a right-padding request;
// the numeric cases pass padc on to printnum, which pads with '-' characters.
case '-':
padc = '-';
goto reswitch;
// flag to pad with 0's instead of spaces
case '0':
padc = '0';
goto reswitch;
// width field
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
// Accumulate the decimal number; fmt is left on the first non-digit.
for (precision = 0; ; ++fmt) {
precision = precision * 10 + ch - '0';
ch = *fmt;
if (ch < '0' || ch > '9')
break;
}
goto process_precision;
case '*':
precision = va_arg(ap, int);
goto process_precision;
case '.':
if (width < 0)
width = 0;
goto reswitch;
case '#':
altflag = 1;
goto reswitch;
// A number parsed while width is still unset is the width, not the precision.
process_precision:
if (width < 0)
width = precision, precision = -1;
goto reswitch;
// long flag (doubled for long long)
case 'l':
lflag++;
goto reswitch;
// character
case 'c':
putch(va_arg(ap, int), putdat);
break;
// string
case 's':
if ((p = va_arg(ap, char *)) == NULL)
p = "(null)";
// Left padding, unless '-' was given.
if (width > 0 && padc != '-')
for (width -= strnlen(p, precision); width > 0; width--)
putch(padc, putdat);
// Print at most `precision` characters when a precision was given.
for (; (ch = *p) != '\0' && (precision < 0 || --precision >= 0); width--) {
putch(ch, putdat);
p++;
}
// Whatever is left of the width is filled with spaces on the right.
for (; width > 0; width--)
putch(' ', putdat);
break;
// (signed) decimal
case 'd':
num = getint(&ap, lflag);
// The minus sign is emitted before any padding that printnum adds.
if ((long long) num < 0) {
putch('-', putdat);
num = -(long long) num;
}
base = 10;
goto signed_number;
// unsigned decimal
case 'u':
base = 10;
goto unsigned_number;
// (unsigned) octal
case 'o':
// should do something with padding so it's always 3 octits
base = 8;
goto unsigned_number;
// pointer
case 'p':
static_assert(sizeof(long) == sizeof(void*));
lflag = 1;
putch('0', putdat);
putch('x', putdat);
/* fall through to 'x' */
// (unsigned) hexadecimal
case 'x':
base = 16;
unsigned_number:
num = getuint(&ap, lflag);
signed_number:
printnum(putch, putdat, num, base, width, padc);
break;
// escaped '%' character
case '%':
putch(ch, putdat);
break;
// unrecognized escape sequence - just print it literally
default:
putch('%', putdat);
fmt = last_fmt;
break;
}
}
}
// Freestanding printf: formats with vprintfmt and emits through putchar.
// The return value is always 0 rather than the number of characters written.
int printf(const char* fmt, ...)
{
  va_list args;

  va_start(args, fmt);
  vprintfmt((void*)putchar, 0, fmt, args);
  va_end(args);

  return 0;
}
// Output callback used by sprintf: `data` points at the write cursor
// (a char*); the character is stored there and the cursor advances by one.
// This used to be a function nested inside sprintf, which is a GNU C
// extension. It captures no local variables, so an ordinary file-scope
// static function behaves the same and builds with other compilers too.
static void sprintf_putch(int ch, void** data)
{
  char** pstr = (char**)data;
  **pstr = ch;
  (*pstr)++;
}

// Freestanding sprintf: formats into str without any bounds checking,
// NUL-terminates the result and returns the number of characters written
// (terminator excluded). The caller must provide a large enough buffer.
int sprintf(char* str, const char* fmt, ...)
{
  va_list ap;
  char* str0 = str;

  va_start(ap, fmt);
  // str itself is the cursor that sprintf_putch advances.
  vprintfmt(sprintf_putch, (void**)&str, fmt, ap);
  *str = 0;
  va_end(ap);

  return str - str0;
}
// Freestanding memcpy: copies len bytes from src to dest and returns dest.
// The regions must not overlap in a way that a forward copy would corrupt.
// NOTE(review): kept as plain loops on purpose; the Makefiles in this tree
// pass -fno-tree-loop-distribute-patterns, presumably so the compiler does
// not turn these loops back into a call to memcpy itself.
void* memcpy(void* dest, const void* src, size_t len)
{
// Word-at-a-time path: only when both pointers and the length are
// multiples of sizeof(uintptr_t).
if ((((uintptr_t)dest | (uintptr_t)src | len) & (sizeof(uintptr_t)-1)) == 0) {
const uintptr_t* s = src;
uintptr_t *d = dest;
// dest + len is byte arithmetic on a void pointer (GNU C extension).
while (d < (uintptr_t*)(dest + len))
*d++ = *s++;
} else {
// Fallback: byte-at-a-time copy.
const char* s = src;
char *d = dest;
while (d < (char*)(dest + len))
*d++ = *s++;
}
return dest;
}
// Freestanding memset: stores the low 8 bits of `byte` into len bytes
// starting at dest and returns dest.
void* memset(void* dest, int byte, size_t len)
{
// Word-at-a-time path: only when dest and len are multiples of
// sizeof(uintptr_t).
if ((((uintptr_t)dest | len) & (sizeof(uintptr_t)-1)) == 0) {
// Replicate the byte into every byte lane of a word.
uintptr_t word = byte & 0xFF;
word |= word << 8;
word |= word << 16;
// Written as two 16-bit shifts so the expression stays a valid shift
// when uintptr_t is only 32 bits wide.
word |= word << 16 << 16;
uintptr_t *d = dest;
// dest + len is byte arithmetic on a void pointer (GNU C extension).
while (d < (uintptr_t*)(dest + len))
*d++ = word;
} else {
// Fallback: byte-at-a-time fill.
char *d = dest;
while (d < (char*)(dest + len))
*d++ = byte;
}
return dest;
}
// Returns the number of characters in s before the terminating NUL.
size_t strlen(const char *s)
{
  size_t count = 0;
  while (s[count] != '\0')
    count++;
  return count;
}
// Returns the length of s, but never looks at more than n characters.
size_t strnlen(const char *s, size_t n)
{
  size_t count = 0;
  while (count < n && s[count] != '\0')
    count++;
  return count;
}
// Compares two strings byte by byte as unsigned characters.
// Returns the difference of the first pair of bytes that differ, or 0 when
// both strings end together.
int strcmp(const char* s1, const char* s2)
{
  for (;;) {
    unsigned char left = *s1++;
    unsigned char right = *s2++;
    if (left == 0 || left != right)
      return left - right;
  }
}
// Copies src, including its terminating NUL, to dest and returns dest.
char* strcpy(char* dest, const char* src)
{
  size_t i = 0;
  for (;;) {
    char ch = src[i];
    dest[i] = ch;
    if (ch == '\0')
      break;
    i++;
  }
  return dest;
}
// Freestanding atol: parses an optionally signed decimal integer.
// Leading spaces are skipped, then one optional '+' or '-' is accepted,
// then decimal digits are consumed. Parsing stops at the first character
// that is not a digit; without digits the result is 0. Overflow is not
// detected.
long atol(const char* str)
{
  long res = 0;
  int sign = 0;

  while (*str == ' ')
    str++;

  if (*str == '-' || *str == '+') {
    sign = *str == '-';
    str++;
  }

  // The loop used to run until the NUL terminator, so any trailing
  // non-digit (as in "12abc" or "42\n") was folded into the result as if it
  // were a digit. Stop at the first non-digit, as the standard atol does.
  while (*str >= '0' && *str <= '9') {
    res *= 10;
    res += *str++ - '0';
  }

  return sign ? -res : res;
}

66
examples/C/common/test.ld Normal file
View File

@ -0,0 +1,66 @@
/*======================================================================*/
/* Proxy kernel linker script */
/*======================================================================*/
/* This is the linker script used when building the proxy kernel. */
/*----------------------------------------------------------------------*/
/* Setup */
/*----------------------------------------------------------------------*/
/* The OUTPUT_ARCH command specifies the machine architecture where the
argument is one of the names used in the BFD library. More
specifically one of the entries in bfd/cpu-mips.c */
OUTPUT_ARCH( "riscv" )
ENTRY(_start)
/*----------------------------------------------------------------------*/
/* Sections */
/*----------------------------------------------------------------------*/
/* Memory layout of the image. Sections are laid out in the order listed,
   starting at 0x80000000. */
SECTIONS
{
/* text: test code section */
/* .text.init holds _start from crt.S and is placed at the very beginning. */
. = 0x80000000;
.text.init : { *(.text.init) }
/* The host mailbox (tohost/fromhost) starts on its own 4 KiB boundary. */
. = ALIGN(0x1000);
.tohost : { *(.tohost) }
. = ALIGN(0x1000);
.text : { *(.text) }
/* data segment */
.data : { *(.data) }
.sdata : {
/* crt.S loads this symbol into gp; it lies 0x800 bytes into .sdata. */
__global_pointer$ = . + 0x800;
*(.srodata.cst16) *(.srodata.cst8) *(.srodata.cst4) *(.srodata.cst2) *(.srodata*)
*(.sdata .sdata.* .gnu.linkonce.s.*)
}
/* bss segment */
.sbss : {
*(.sbss .sbss.* .gnu.linkonce.sb.*)
*(.scommon)
}
.bss : { *(.bss) }
/* thread-local data segment */
/* _tdata_begin, _tdata_end and _tbss_end delimit the TLS image that
   init_tls() in syscalls.c copies and zero-fills. */
.tdata :
{
_tdata_begin = .;
*(.tdata)
_tdata_end = .;
}
.tbss :
{
*(.tbss)
_tbss_end = .;
}
/* End of uninitialized data segment */
/* crt.S places the per-core TLS and stack area just above _end. */
_end = .;
}

90
examples/C/common/util.h Normal file
View File

@ -0,0 +1,90 @@
// See LICENSE for license details.
#ifndef __UTIL_H
#define __UTIL_H
extern void setStats(int enable);
#include <stdint.h>
#define static_assert(cond) switch(0) { case 0: case !!(long)(cond): ; }
// Compares the first n ints of test against verify.
// Returns 0 when they all match, otherwise the 1-based index of the first
// mismatch. Elements are read two at a time for speed.
static int verify(int n, const volatile int* test, const int* verify)
{
  const int paired = n / 2 * 2;   // number of elements covered by full pairs
  int i = 0;

  while (i < paired) {
    // Both elements of the pair are read before either is compared.
    int got_lo = test[i];
    int got_hi = test[i+1];
    int want_lo = verify[i];
    int want_hi = verify[i+1];
    if (got_lo != want_lo)
      return i + 1;
    if (got_hi != want_hi)
      return i + 2;
    i += 2;
  }

  // Odd n: one element is left over.
  if (n % 2 != 0) {
    if (test[n-1] != verify[n-1])
      return n;
  }
  return 0;
}
// Compares the first n doubles of test against verify using exact equality.
// Returns 0 when they all match, otherwise the 1-based index of the first
// mismatch. Elements are read two at a time for speed.
static int verifyDouble(int n, const volatile double* test, const double* verify)
{
  const int paired = n / 2 * 2;   // number of elements covered by full pairs
  int i = 0;

  while (i < paired) {
    // Both elements of the pair are read before either is compared.
    double got_lo = test[i];
    double got_hi = test[i+1];
    double want_lo = verify[i];
    double want_hi = verify[i+1];
    int lo_matches = (got_lo == want_lo);
    int hi_matches = (got_hi == want_hi);
    if (!lo_matches)
      return i + 1;
    if (!hi_matches)
      return i + 2;
    i += 2;
  }

  // Odd n: one element is left over.
  if (n % 2 != 0) {
    if (test[n-1] != verify[n-1])
      return n;
  }
  return 0;
}
// Sense-reversing spin barrier for ncores participants.
// Every caller flips its thread-local sense and increments the shared
// arrival count. The last arriver resets the count and publishes the new
// sense; all others spin until the shared sense matches their own.
static void __attribute__((noinline)) barrier(int ncores)
{
static volatile int sense;
static volatile int count;
static __thread int threadsense;
__sync_synchronize();
threadsense = !threadsense;
// The fetch-and-add returns the count before the increment, so
// ncores-1 identifies the last arriver.
if (__sync_fetch_and_add(&count, 1) == ncores-1)
{
// Reset for the next round first, then release the waiting callers.
count = 0;
sense = threadsense;
}
else while(sense != threadsense)
;
__sync_synchronize();
}
// One step of a shift-register pseudo-random generator: x is shifted right
// by one and the XOR of its two lowest bits is inserted at bit 62.
static uint64_t lfsr(uint64_t x)
{
  uint64_t shifted = x >> 1;
  uint64_t feedback = (x ^ shifted) & 1;
  return shifted | (feedback << 62);
}
// Returns the length in bytes (2 or 4) of the RISC-V instruction at pc.
// Only the 16-bit and 32-bit encodings are distinguished: an instruction is
// 32 bits long when the two lowest bits of its first halfword are both set;
// every other value of those bits (00, 01, 10) is a 16-bit compressed
// instruction. Testing the bits for non-zero instead of for 0b11 reported
// compressed instructions from quadrants 1 and 2 as 4 bytes long.
static uintptr_t insn_len(uintptr_t pc)
{
  unsigned short first_halfword = *(unsigned short*)pc;
  return ((first_halfword & 3) == 3) ? 4 : 2;
}
#ifdef __riscv
#include "encoding.h"
#endif
#define stringify_1(s) #s
#define stringify(s) stringify_1(s)
// stats(code, iter): runs `code` once and reports how many cycles and
// instructions it took, read from the mcycle and minstret CSRs.
// The counters start out negated and the end readings are added, which
// leaves the elapsed amounts in _c and _i. The report is printed only when
// a variable named cid in the calling scope equals 0. Each ratio is printed
// as an integer part followed by one decimal digit.
// NOTE(review): the CPI field is computed as _c/_i; presumably cycles per
// instruction - confirm against how the output is consumed.
#define stats(code, iter) do { \
unsigned long _c = -read_csr(mcycle), _i = -read_csr(minstret); \
code; \
_c += read_csr(mcycle), _i += read_csr(minstret); \
if (cid == 0) \
printf("\n%s: %ld cycles, %ld.%ld cycles/iter, %ld.%ld CPI\n", \
stringify(code), _c, _c/iter, 10*_c/iter%10, _c/_i, 10*_c/_i%10); \
} while(0)
#endif //__UTIL_H

16
examples/C/mm/Makefile Normal file
View File

@ -0,0 +1,16 @@
# Builds the bare-metal matrix-multiply benchmark ($(TARGET)) and its
# disassembly ($(TARGET).objdump, the default goal).
TARGET = mm

$(TARGET).objdump: $(TARGET)
	riscv64-unknown-elf-objdump -S -D $(TARGET) > $(TARGET).objdump

# The prerequisites now list everything the compile line reads (all local
# sources and headers, the shared runtime and the linker script), so a
# change to any of them triggers a rebuild.
#
# The libraries moved behind the source files: with static linking an
# archive only satisfies symbols that are already undefined when it is
# scanned, so listing -lm -lgcc first could never resolve anything.
#
# Duplicate -mcmodel=medany and -static flags were dropped.
# NOTE(review): the original passed -O2 and, later on the same command line,
# a bare -O. GCC honours the last -O option, so -O is what was in effect and
# is what is kept here. Change it to -O2 if that was the intent.
$(TARGET): $(wildcard *.c) $(wildcard *.h) ../common/crt.S ../common/syscalls.c \
		$(wildcard ../common/*.h) ../common/test.ld Makefile
	riscv64-unknown-elf-gcc -g -o $(TARGET) -march=rv64gc -mabi=lp64d -mcmodel=medany \
		-DPREALLOCATE=1 -static -std=gnu99 -O -ffast-math -fno-common \
		-fno-builtin-printf -fno-tree-loop-distribute-patterns \
		-nostdlib -nostartfiles -T../common/test.ld \
		-I../common \
		*.c \
		../common/crt.S ../common/syscalls.c \
		-lm -lgcc

# clean produces no file of that name, so mark it phony.
.PHONY: clean
clean:
	rm -f $(TARGET) $(TARGET).objdump

36
examples/C/mm/common.h Normal file
View File

@ -0,0 +1,36 @@
// See LICENSE for license details.
// Shared declarations for the matrix-multiply benchmark: the element type t,
// helper macros, the generated register-block kernel (rb.h) and the mm()
// entry point.
#ifndef _MM_H
#define _MM_H
#include <string.h>
#include <stdint.h>
#include <math.h>
// Element type: float when SP is defined (fma then maps to fmaf),
// double otherwise.
#ifdef SP
typedef float t;
#define fma fmaf
#else
typedef double t;
#endif
// From here on every `inline` also carries the always_inline attribute.
#define inline inline __attribute__((always_inline))
// alloca_aligned(s, a): stack-allocates s bytes aligned to a. It requests
// a-1 extra bytes and rounds the address up with a mask, so a must be a
// power of two. The memory is released when the calling function returns.
#define alloca_aligned(s, a) ((void*)(((uintptr_t)alloca((s)+(a)-1)+(a)-1)&~((a)-1)))
// rb.h uses t and fma, so it has to be included after they are defined.
#include "rb.h"
#ifdef __cplusplus
extern "C" {
#endif
// c += a * b, where a is m x p, b is p x n and c is m x n. lda, ldb and ldc
// are the row strides in elements. (Behaviour as implemented in mm.c.)
void mm(size_t m, size_t n, size_t p,
t* a, size_t lda, t* b, size_t ldb, t* c, size_t ldc);
#ifdef __cplusplus
}
#endif
//void rb(t* a, t* b, t* c, size_t lda, size_t ldb, size_t ldc);
#endif

BIN
examples/C/mm/mm Executable file

Binary file not shown.

152
examples/C/mm/mm.c Normal file
View File

@ -0,0 +1,152 @@
// See LICENSE for license details.
#include "common.h"
#include <assert.h>
#include <math.h>
#include <stdint.h>
#include <alloca.h>
#define MIN(a, b) ((a) < (b) ? (a) : (b))
// Reference kernel: c += a * b for an m x p matrix a, a p x n matrix b and
// an m x n matrix c, with row strides lda, ldb and ldc (in elements).
// The dot product is unrolled by four into independent accumulators; the
// first one starts from the existing c element and also takes the
// leftover terms when p is not a multiple of four.
static void mm_naive(size_t m, size_t n, size_t p,
                     t* a, size_t lda, t* b, size_t ldb, t* c, size_t ldc)
{
  for (size_t row = 0; row < m; row++)
  {
    for (size_t col = 0; col < n; col++)
    {
      t* out = &c[row*ldc+col];
      t acc0 = *out;
      t acc1 = 0;
      t acc2 = 0;
      t acc3 = 0;
      size_t k = 0;

      // Four products per step, one per accumulator.
      while (k < p/4*4)
      {
        acc0 = fma(a[row*lda+k+0], b[(k+0)*ldb+col], acc0);
        acc1 = fma(a[row*lda+k+1], b[(k+1)*ldb+col], acc1);
        acc2 = fma(a[row*lda+k+2], b[(k+2)*ldb+col], acc2);
        acc3 = fma(a[row*lda+k+3], b[(k+3)*ldb+col], acc3);
        k += 4;
      }

      // Remaining zero to three products.
      while (k < p)
      {
        acc0 = fma(a[row*lda+k], b[k*ldb+col], acc0);
        k++;
      }

      *out = (acc0 + acc1) + (acc2 + acc3);
    }
  }
}
// Register-blocked multiply: c += a * b.
// Full RBM x RBN tiles of c are handled by the generated kloop kernel; the
// columns right of the last full tile and the rows below the last full
// strip fall back to mm_naive.
static inline void mm_rb(size_t m, size_t n, size_t p,
                         t* a, size_t lda, t* b, size_t ldb, t* c, size_t ldc)
{
  const size_t tiled_rows = m/RBM*RBM;
  const size_t tiled_cols = n/RBN*RBN;
  size_t i = 0;

  while (i < tiled_rows)
  {
    t* a_strip = a+i*lda;

    // Full tiles of this RBM-row strip.
    for (size_t j = 0; j < tiled_cols; j += RBN)
      kloop(p, a_strip, lda, b+j, ldb, c+i*ldc+j, ldc);

    // Columns of the strip that do not fill a tile.
    mm_naive(RBM, n - tiled_cols, p, a_strip, lda, b+tiled_cols, ldb, c+i*ldc+tiled_cols, ldc);

    i += RBM;
  }

  // Rows below the last full strip, across the full width.
  mm_naive(m - tiled_rows, n, p, a+tiled_rows*lda, lda, b, ldb, c+tiled_rows*ldc, ldc);
}
// Copies an m x p block from a0 (row stride lda0) to a (row stride lda).
// Rows are copied eight elements at a time: all eight are loaded before
// any of them is stored. The tail of each row is copied element by element.
static inline void repack(t* a, size_t lda, const t* a0, size_t lda0, size_t m, size_t p)
{
  const size_t chunked = p/8*8;   // columns covered by full groups of eight

  for (size_t row = 0; row < m; row++)
  {
    const t* src = a0 + row*lda0;
    t* dst = a + row*lda;
    size_t col = 0;

    for (; col < chunked; col += 8)
    {
      t v0 = src[col+0];
      t v1 = src[col+1];
      t v2 = src[col+2];
      t v3 = src[col+3];
      t v4 = src[col+4];
      t v5 = src[col+5];
      t v6 = src[col+6];
      t v7 = src[col+7];
      dst[col+0] = v0;
      dst[col+1] = v1;
      dst[col+2] = v2;
      dst[col+3] = v3;
      dst[col+4] = v4;
      dst[col+5] = v5;
      dst[col+6] = v6;
      dst[col+7] = v7;
    }

    for (; col < p; col++)
      dst[col] = src[col];
  }
}
// Cache-blocked multiply: c += a * b.
// The part of the problem that is covered by whole CBM x CBN x CBK blocks is
// copied into block-contiguous stack buffers, multiplied block by block with
// mm_rb and copied back. Whatever is left over in any dimension is
// multiplied straight from the original matrices.
static void mm_cb(size_t m, size_t n, size_t p,
t* a, size_t lda, t* b, size_t ldb, t* c, size_t ldc)
{
// nmb/nnb/npb: number of whole blocks per dimension;
// mb/nb/pb: number of rows/columns those whole blocks cover.
size_t nmb = m/CBM, nnb = n/CBN, npb = p/CBK;
size_t mb = nmb*CBM, nb = nnb*CBN, pb = npb*CBK;
//t a1[mb*pb], b1[pb*nb], c1[mb*nb];
// Packed copies live on the stack (alloca) with 8192-byte alignment and are
// released when this function returns.
t* a1 = (t*)alloca_aligned(sizeof(t)*mb*pb, 8192);
t* b1 = (t*)alloca_aligned(sizeof(t)*pb*nb, 8192);
t* c1 = (t*)alloca_aligned(sizeof(t)*mb*nb, 8192);
// Pack a, b and c: every block becomes one contiguous run whose row stride
// is the block width; blocks are stored in row-major block order.
for (size_t i = 0; i < mb; i += CBM)
for (size_t j = 0; j < pb; j += CBK)
repack(a1 + (npb*(i/CBM) + j/CBK)*(CBM*CBK), CBK, a + i*lda + j, lda, CBM, CBK);
for (size_t i = 0; i < pb; i += CBK)
for (size_t j = 0; j < nb; j += CBN)
repack(b1 + (nnb*(i/CBK) + j/CBN)*(CBK*CBN), CBN, b + i*ldb + j, ldb, CBK, CBN);
for (size_t i = 0; i < mb; i += CBM)
for (size_t j = 0; j < nb; j += CBN)
repack(c1 + (nnb*(i/CBM) + j/CBN)*(CBM*CBN), CBN, c + i*ldc + j, ldc, CBM, CBN);
for (size_t i = 0; i < mb; i += CBM)
{
for (size_t j = 0; j < nb; j += CBN)
{
// Whole blocks: packed a block times packed b block into packed c block.
for (size_t k = 0; k < pb; k += CBK)
{
mm_rb(CBM, CBN, CBK,
a1 + (npb*(i/CBM) + k/CBK)*(CBM*CBK), CBK,
b1 + (nnb*(k/CBK) + j/CBN)*(CBK*CBN), CBN,
c1 + (nnb*(i/CBM) + j/CBN)*(CBM*CBN), CBN);
}
// Leftover part of the inner dimension: read a and b in place, still
// accumulating into the packed c block.
if (pb < p)
{
mm_rb(CBM, CBN, p - pb,
a + i*lda + pb, lda,
b + pb*ldb + j, ldb,
c1 + (nnb*(i/CBM) + j/CBN)*(CBM*CBN), CBN);
}
}
// Columns right of the last whole block: computed directly on c.
if (nb < n)
{
for (size_t k = 0; k < p; k += CBK)
{
mm_rb(CBM, n - nb, MIN(p - k, CBK),
a + i*lda + k, lda,
b + k*ldb + nb, ldb,
c + i*ldc + nb, ldc);
}
}
}
// Rows below the last whole block: computed directly on c, full width.
if (mb < m)
{
for (size_t j = 0; j < n; j += CBN)
{
for (size_t k = 0; k < p; k += CBK)
{
mm_rb(m - mb, MIN(n - j, CBN), MIN(p - k, CBK),
a + mb*lda + k, lda,
b + k*ldb + j, ldb,
c + mb*ldc + j, ldc);
}
}
}
// Copy the packed result blocks back into c.
for (size_t i = 0; i < mb; i += CBM)
for (size_t j = 0; j < nb; j += CBN)
repack(c + i*ldc + j, ldc, c1 + (nnb*(i/CBM) + j/CBN)*(CBM*CBN), CBN, CBM, CBN);
}
// Public entry point: c += a * b for an m x p matrix a, a p x n matrix b
// and an m x n matrix c (row strides lda, ldb, ldc in elements).
// Problems of at most two cache blocks per dimension, which is the expected
// case, go straight to the register-blocked kernel; larger ones use the
// cache-blocked path.
void mm(size_t m, size_t n, size_t p,
        t* a, size_t lda, t* b, size_t ldb, t* c, size_t ldc)
{
  int fits_small_path = (m <= 2*CBM && n <= 2*CBN && p <= 2*CBK);

  if (__builtin_expect(fits_small_path, 1))
  {
    mm_rb(m, n, p, a, lda, b, ldb, c, ldc);
    return;
  }
  mm_cb(m, n, p, a, lda, b, ldb, c, ldc);
}

76
examples/C/mm/mm_main.c Normal file
View File

@ -0,0 +1,76 @@
// See LICENSE for license details.
#include "common.h"
#include <assert.h>
#include <stdlib.h>
#include <stdio.h>
#include "util.h"
#pragma GCC optimize ("unroll-loops")
// Benchmark driver for mm().
// Fills a (CBM x CBK) and b (CBK x CBN) with pseudo-random values, calls
// mm() R times accumulating into c, prints the instruction and cycle counts
// of the last call, and finally checks c against a straightforward
// triple-loop product. Exits with status 0 on success and 1 on a mismatch.
int main(int argc, char** argv)
//void thread_entry(int cid, int nc)
{
  const int R = 8;              // number of mm() calls
  int m, n, p;
  uint64_t s = 0xdeadbeefU;     // state of the lfsr() generator
  int cid = 0;
  int nc = 0;

  m = CBM;
  n = CBN;
  p = CBK;

  t a[m*p];
  t b[p*n];
  t c[m*n];

  for (size_t i = 0; i < m; i++)
    for (size_t j = 0; j < p; j++)
      a[i*p+j] = (t)(s = lfsr(s));
  for (size_t i = 0; i < p; i++)
    for (size_t j = 0; j < n; j++)
      b[i*n+j] = (t)(s = lfsr(s));
  memset(c, 0, m*n*sizeof(c[0]));

  // The counters are re-initialised on every pass, so after the loop they
  // hold the cost of the last mm() call only.
  size_t instret, cycles;
  for (int i = 0; i < R; i++)
  {
    instret = -read_csr(minstret);
    cycles = -read_csr(mcycle);
    mm(m, n, p, a, p, b, n, c, n);
    instret += read_csr(minstret);
    cycles += read_csr(mcycle);
  }

  asm volatile("fence");

  printf("C%d: reg block %dx%dx%d, cache block %dx%dx%d\n",
         cid, RBM, RBN, RBK, CBM, CBN, CBK);
  printf("C%d: %d instructions\n", cid, (int)(instret));
  printf("C%d: %d cycles\n", cid, (int)(cycles));
  printf("C%d: %d flops\n", cid, 2*m*n*p);
  // Dividing by the size_t cycle count makes the whole expression a size_t,
  // which does not match the %d conversion. Cast it like the two counters
  // above.
  printf("C%d: %d Mflops @ 1 GHz\n", cid, (int)(2000*m*n*p/(cycles)));

#if 1
  for (size_t i = 0; i < m; i++)
  {
    for (size_t j = 0; j < n; j++)
    {
      t s = 0;
      for (size_t k = 0; k < p; k++)
        s += a[i*p+k] * b[k*n+j];
      // c accumulated R identical products.
      s *= R;
      if (fabs(c[i*n+j]-s) > fabs(1e-6*s))
      {
        // NOTE(review): vprintfmt in ../common/syscalls.c has no case for
        // 'f', so with that printf the two %f fields are echoed literally
        // instead of showing the values.
        printf("C%d: c[%lu][%lu] %f != %f\n", cid, i, j, c[i*n+j], s);
        exit(1);
      }
    }
  }
#endif

  //barrier(nc);
  exit(0);
}

View File

@ -0,0 +1,81 @@
import scala.sys.process._
// Generator for rb.h: emits the block-size constants and the fully unrolled
// C kernel `kloop` for a given register-block shape, then builds and runs
// the benchmark.
object MMGen {
// Lets Int values be passed where a String is expected (used by r and ar).
implicit def i2s(i: Int) = i.toString
// Writes `contents` to the file `name`, replacing what was there.
def writeFile(name: String, contents: String) = {
val f = new java.io.FileWriter(name)
f.write(contents)
f.close
}
// Current nesting depth of the generated code; changed by open_block and
// close_block and read by spacing.
var indent = 0
def spacing = " " * indent
// "lhs = rhs;" as one generated line.
def assign(lhs: String, rhs: String) =
spacing + lhs + " = " + rhs + ";\n"
// "t n = v;" - a declaration with initializer.
def init(t: String, n: String, v: String) =
assign(t+" "+n, v)
// Emits the optional header line `s` followed by "{" and increases the depth.
def open_block(s: String = "") = {
val result = (if (s != "") spacing + s else "") + spacing + "{\n"
indent = indent + 1
result
}
// Decreases the depth and emits the matching "}".
def close_block = {
indent = indent - 1
spacing + "}\n"
}
// ar("c", "3") == "c[3]"
def ar(m: String, i: String) = m+"["+i+"]"
// r("c", "1", "2") == "c_1_2" - name of a generated scalar or row pointer.
def r(a: String, b: String*) = (a :: b.toList).reduceLeft(_+"_"+_)
// Returns the C source of kloop for an m x n block of c with the inner
// dimension unrolled p times.
def rb(m: Int, n: Int, p: Int) = {
var s = open_block("static inline void kloop(size_t p, t* a0, size_t lda, t* b0, size_t ldb, t* c, size_t ldc)\n")
// Row pointers into c, then one scalar per element of the block.
for (i <- 0 until m)
s += init("t*", r("c", i), "&"+ar("c", "ldc*"+i))
for (i <- 0 until m; j <- 0 until n)
s += init("t", r("c", i, j), ar(r("c", i), j))
// Appends the body of one loop iteration: row pointers into a and b, then
// one fma per (k, i, j).
def doit(m: Int, n: Int, p: Int) = {
for (i <- 0 until m)
s += init("t*", r("a", i), "&"+ar("a", "lda*"+i))
for (k <- 0 until p)
s += init("t*", r("b", k), "&"+ar("b", "ldb*"+k))
for (k <- 0 until p; i <- 0 until m; j <- 0 until n)
s += assign(r("c", i, j), "fma(" + ar(r("a", i), k) + ", " + ar(r("b", k), j) + ", " + r("c", i, j) + ")")
}
// Main loop: RBK steps of the inner dimension per iteration.
s += open_block("for (t *a = a0, *b = b0; a < a0 + p/RBK*RBK; a += RBK, b += RBK*ldb)\n")
doit(m, n, p)
s += close_block
// Remainder loop: one step of the inner dimension per iteration.
s += open_block("for (t *a = a0 + p/RBK*RBK, *b = b0 + p/RBK*RBK*ldb; a < a0 + p; a++, b += ldb)\n")
doit(m, n, 1)
s += close_block
// Store the scalars back into c.
for (i <- 0 until m; j <- 0 until n)
s += assign(ar(r("c", i), j), r("c", i, j))
s += close_block
s
}
def gcd(a: Int, b: Int): Int = if (b == 0) a else gcd(b, a%b)
def lcm(a: Int, b: Int): Int = a*b/gcd(a, b)
// Least common multiple of a non-empty sequence.
def lcm(a: Seq[Int]): Int = {
if (a.tail.isEmpty) a.head
else lcm(a.head, lcm(a.tail))
}
// Writes rb.h for register block (m, n, p) and cache block (m1, n1, p1),
// runs "make run" and copies a.out to "b.<m>.<n>.<p>.<m1>.<n1>.<p1>.run".
def test1(m: Int, n: Int, p: Int, m1: Int, n1: Int, p1: Int) = {
val decl = "static const int RBM = "+m+", RBN = "+n+", RBK = "+p+";\n" +
"static const int CBM = "+m1+", CBN = "+n1+", CBK = "+p1+";\n"
writeFile("rb.h", decl + rb(m, n, p))
//"make"!!
"make run"!
("cp a.out " + Seq("b", m, n, p, m1, n1, p1, "run").reduce(_+"."+_))!
}
// Generates and runs the single configuration 4x5x6 / 24x25x24.
def main(args: Array[String]): Unit = {
test1(4, 5, 6, 24, 25, 24)
//for (i <- 4 to 6; j <- 4 to 6; k <- 4 to 6)
// test1(i, j, k, if (i == 5) 35 else 36, if (j == 5) 35 else 36, if (k == 5) 35 else 36)
}
}

210
examples/C/mm/rb.h Normal file
View File

@ -0,0 +1,210 @@
static const int RBM = 4, RBN = 5, RBK = 6;
static const int CBM = 24, CBN = 25, CBK = 24;
// Register-block kernel: accumulates a 4 x 5 block of c,
//   c[i][j] += sum over k < p of a0[i][k] * b0[k][j],
// with row strides lda, ldb and ldc (in elements).
// The 20 results are kept in scalars for the whole loop and written back
// once at the end. The main loop consumes six values of k per iteration
// (RBK); the second loop handles the remaining p % RBK values one by one.
// NOTE(review): this text matches what MMGen.rb(4, 5, 6) in gen.scala emits
// into rb.h - presumably it should be regenerated rather than edited by hand.
static inline void kloop(size_t p, t* a0, size_t lda, t* b0, size_t ldb, t* c, size_t ldc)
{
t* c_0 = &c[ldc*0];
t* c_1 = &c[ldc*1];
t* c_2 = &c[ldc*2];
t* c_3 = &c[ldc*3];
// Load the current block of c into the accumulators.
t c_0_0 = c_0[0];
t c_0_1 = c_0[1];
t c_0_2 = c_0[2];
t c_0_3 = c_0[3];
t c_0_4 = c_0[4];
t c_1_0 = c_1[0];
t c_1_1 = c_1[1];
t c_1_2 = c_1[2];
t c_1_3 = c_1[3];
t c_1_4 = c_1[4];
t c_2_0 = c_2[0];
t c_2_1 = c_2[1];
t c_2_2 = c_2[2];
t c_2_3 = c_2[3];
t c_2_4 = c_2[4];
t c_3_0 = c_3[0];
t c_3_1 = c_3[1];
t c_3_2 = c_3[2];
t c_3_3 = c_3[3];
t c_3_4 = c_3[4];
// Main loop: a advances RBK columns and b advances RBK rows per iteration.
for (t *a = a0, *b = b0; a < a0 + p/RBK*RBK; a += RBK, b += RBK*ldb)
{
t* a_0 = &a[lda*0];
t* a_1 = &a[lda*1];
t* a_2 = &a[lda*2];
t* a_3 = &a[lda*3];
t* b_0 = &b[ldb*0];
t* b_1 = &b[ldb*1];
t* b_2 = &b[ldb*2];
t* b_3 = &b[ldb*3];
t* b_4 = &b[ldb*4];
t* b_5 = &b[ldb*5];
// k = 0
c_0_0 = fma(a_0[0], b_0[0], c_0_0);
c_0_1 = fma(a_0[0], b_0[1], c_0_1);
c_0_2 = fma(a_0[0], b_0[2], c_0_2);
c_0_3 = fma(a_0[0], b_0[3], c_0_3);
c_0_4 = fma(a_0[0], b_0[4], c_0_4);
c_1_0 = fma(a_1[0], b_0[0], c_1_0);
c_1_1 = fma(a_1[0], b_0[1], c_1_1);
c_1_2 = fma(a_1[0], b_0[2], c_1_2);
c_1_3 = fma(a_1[0], b_0[3], c_1_3);
c_1_4 = fma(a_1[0], b_0[4], c_1_4);
c_2_0 = fma(a_2[0], b_0[0], c_2_0);
c_2_1 = fma(a_2[0], b_0[1], c_2_1);
c_2_2 = fma(a_2[0], b_0[2], c_2_2);
c_2_3 = fma(a_2[0], b_0[3], c_2_3);
c_2_4 = fma(a_2[0], b_0[4], c_2_4);
c_3_0 = fma(a_3[0], b_0[0], c_3_0);
c_3_1 = fma(a_3[0], b_0[1], c_3_1);
c_3_2 = fma(a_3[0], b_0[2], c_3_2);
c_3_3 = fma(a_3[0], b_0[3], c_3_3);
c_3_4 = fma(a_3[0], b_0[4], c_3_4);
// k = 1
c_0_0 = fma(a_0[1], b_1[0], c_0_0);
c_0_1 = fma(a_0[1], b_1[1], c_0_1);
c_0_2 = fma(a_0[1], b_1[2], c_0_2);
c_0_3 = fma(a_0[1], b_1[3], c_0_3);
c_0_4 = fma(a_0[1], b_1[4], c_0_4);
c_1_0 = fma(a_1[1], b_1[0], c_1_0);
c_1_1 = fma(a_1[1], b_1[1], c_1_1);
c_1_2 = fma(a_1[1], b_1[2], c_1_2);
c_1_3 = fma(a_1[1], b_1[3], c_1_3);
c_1_4 = fma(a_1[1], b_1[4], c_1_4);
c_2_0 = fma(a_2[1], b_1[0], c_2_0);
c_2_1 = fma(a_2[1], b_1[1], c_2_1);
c_2_2 = fma(a_2[1], b_1[2], c_2_2);
c_2_3 = fma(a_2[1], b_1[3], c_2_3);
c_2_4 = fma(a_2[1], b_1[4], c_2_4);
c_3_0 = fma(a_3[1], b_1[0], c_3_0);
c_3_1 = fma(a_3[1], b_1[1], c_3_1);
c_3_2 = fma(a_3[1], b_1[2], c_3_2);
c_3_3 = fma(a_3[1], b_1[3], c_3_3);
c_3_4 = fma(a_3[1], b_1[4], c_3_4);
// k = 2
c_0_0 = fma(a_0[2], b_2[0], c_0_0);
c_0_1 = fma(a_0[2], b_2[1], c_0_1);
c_0_2 = fma(a_0[2], b_2[2], c_0_2);
c_0_3 = fma(a_0[2], b_2[3], c_0_3);
c_0_4 = fma(a_0[2], b_2[4], c_0_4);
c_1_0 = fma(a_1[2], b_2[0], c_1_0);
c_1_1 = fma(a_1[2], b_2[1], c_1_1);
c_1_2 = fma(a_1[2], b_2[2], c_1_2);
c_1_3 = fma(a_1[2], b_2[3], c_1_3);
c_1_4 = fma(a_1[2], b_2[4], c_1_4);
c_2_0 = fma(a_2[2], b_2[0], c_2_0);
c_2_1 = fma(a_2[2], b_2[1], c_2_1);
c_2_2 = fma(a_2[2], b_2[2], c_2_2);
c_2_3 = fma(a_2[2], b_2[3], c_2_3);
c_2_4 = fma(a_2[2], b_2[4], c_2_4);
c_3_0 = fma(a_3[2], b_2[0], c_3_0);
c_3_1 = fma(a_3[2], b_2[1], c_3_1);
c_3_2 = fma(a_3[2], b_2[2], c_3_2);
c_3_3 = fma(a_3[2], b_2[3], c_3_3);
c_3_4 = fma(a_3[2], b_2[4], c_3_4);
// k = 3
c_0_0 = fma(a_0[3], b_3[0], c_0_0);
c_0_1 = fma(a_0[3], b_3[1], c_0_1);
c_0_2 = fma(a_0[3], b_3[2], c_0_2);
c_0_3 = fma(a_0[3], b_3[3], c_0_3);
c_0_4 = fma(a_0[3], b_3[4], c_0_4);
c_1_0 = fma(a_1[3], b_3[0], c_1_0);
c_1_1 = fma(a_1[3], b_3[1], c_1_1);
c_1_2 = fma(a_1[3], b_3[2], c_1_2);
c_1_3 = fma(a_1[3], b_3[3], c_1_3);
c_1_4 = fma(a_1[3], b_3[4], c_1_4);
c_2_0 = fma(a_2[3], b_3[0], c_2_0);
c_2_1 = fma(a_2[3], b_3[1], c_2_1);
c_2_2 = fma(a_2[3], b_3[2], c_2_2);
c_2_3 = fma(a_2[3], b_3[3], c_2_3);
c_2_4 = fma(a_2[3], b_3[4], c_2_4);
c_3_0 = fma(a_3[3], b_3[0], c_3_0);
c_3_1 = fma(a_3[3], b_3[1], c_3_1);
c_3_2 = fma(a_3[3], b_3[2], c_3_2);
c_3_3 = fma(a_3[3], b_3[3], c_3_3);
c_3_4 = fma(a_3[3], b_3[4], c_3_4);
// k = 4
c_0_0 = fma(a_0[4], b_4[0], c_0_0);
c_0_1 = fma(a_0[4], b_4[1], c_0_1);
c_0_2 = fma(a_0[4], b_4[2], c_0_2);
c_0_3 = fma(a_0[4], b_4[3], c_0_3);
c_0_4 = fma(a_0[4], b_4[4], c_0_4);
c_1_0 = fma(a_1[4], b_4[0], c_1_0);
c_1_1 = fma(a_1[4], b_4[1], c_1_1);
c_1_2 = fma(a_1[4], b_4[2], c_1_2);
c_1_3 = fma(a_1[4], b_4[3], c_1_3);
c_1_4 = fma(a_1[4], b_4[4], c_1_4);
c_2_0 = fma(a_2[4], b_4[0], c_2_0);
c_2_1 = fma(a_2[4], b_4[1], c_2_1);
c_2_2 = fma(a_2[4], b_4[2], c_2_2);
c_2_3 = fma(a_2[4], b_4[3], c_2_3);
c_2_4 = fma(a_2[4], b_4[4], c_2_4);
c_3_0 = fma(a_3[4], b_4[0], c_3_0);
c_3_1 = fma(a_3[4], b_4[1], c_3_1);
c_3_2 = fma(a_3[4], b_4[2], c_3_2);
c_3_3 = fma(a_3[4], b_4[3], c_3_3);
c_3_4 = fma(a_3[4], b_4[4], c_3_4);
// k = 5
c_0_0 = fma(a_0[5], b_5[0], c_0_0);
c_0_1 = fma(a_0[5], b_5[1], c_0_1);
c_0_2 = fma(a_0[5], b_5[2], c_0_2);
c_0_3 = fma(a_0[5], b_5[3], c_0_3);
c_0_4 = fma(a_0[5], b_5[4], c_0_4);
c_1_0 = fma(a_1[5], b_5[0], c_1_0);
c_1_1 = fma(a_1[5], b_5[1], c_1_1);
c_1_2 = fma(a_1[5], b_5[2], c_1_2);
c_1_3 = fma(a_1[5], b_5[3], c_1_3);
c_1_4 = fma(a_1[5], b_5[4], c_1_4);
c_2_0 = fma(a_2[5], b_5[0], c_2_0);
c_2_1 = fma(a_2[5], b_5[1], c_2_1);
c_2_2 = fma(a_2[5], b_5[2], c_2_2);
c_2_3 = fma(a_2[5], b_5[3], c_2_3);
c_2_4 = fma(a_2[5], b_5[4], c_2_4);
c_3_0 = fma(a_3[5], b_5[0], c_3_0);
c_3_1 = fma(a_3[5], b_5[1], c_3_1);
c_3_2 = fma(a_3[5], b_5[2], c_3_2);
c_3_3 = fma(a_3[5], b_5[3], c_3_3);
c_3_4 = fma(a_3[5], b_5[4], c_3_4);
}
// Remainder loop: one column of a and one row of b per iteration.
for (t *a = a0 + p/RBK*RBK, *b = b0 + p/RBK*RBK*ldb; a < a0 + p; a++, b += ldb)
{
t* a_0 = &a[lda*0];
t* a_1 = &a[lda*1];
t* a_2 = &a[lda*2];
t* a_3 = &a[lda*3];
t* b_0 = &b[ldb*0];
c_0_0 = fma(a_0[0], b_0[0], c_0_0);
c_0_1 = fma(a_0[0], b_0[1], c_0_1);
c_0_2 = fma(a_0[0], b_0[2], c_0_2);
c_0_3 = fma(a_0[0], b_0[3], c_0_3);
c_0_4 = fma(a_0[0], b_0[4], c_0_4);
c_1_0 = fma(a_1[0], b_0[0], c_1_0);
c_1_1 = fma(a_1[0], b_0[1], c_1_1);
c_1_2 = fma(a_1[0], b_0[2], c_1_2);
c_1_3 = fma(a_1[0], b_0[3], c_1_3);
c_1_4 = fma(a_1[0], b_0[4], c_1_4);
c_2_0 = fma(a_2[0], b_0[0], c_2_0);
c_2_1 = fma(a_2[0], b_0[1], c_2_1);
c_2_2 = fma(a_2[0], b_0[2], c_2_2);
c_2_3 = fma(a_2[0], b_0[3], c_2_3);
c_2_4 = fma(a_2[0], b_0[4], c_2_4);
c_3_0 = fma(a_3[0], b_0[0], c_3_0);
c_3_1 = fma(a_3[0], b_0[1], c_3_1);
c_3_2 = fma(a_3[0], b_0[2], c_3_2);
c_3_3 = fma(a_3[0], b_0[3], c_3_3);
c_3_4 = fma(a_3[0], b_0[4], c_3_4);
}
// Write the accumulators back to c.
c_0[0] = c_0_0;
c_0[1] = c_0_1;
c_0[2] = c_0_2;
c_0[3] = c_0_3;
c_0[4] = c_0_4;
c_1[0] = c_1_0;
c_1[1] = c_1_1;
c_1[2] = c_1_2;
c_1[3] = c_1_3;
c_1[4] = c_1_4;
c_2[0] = c_2_0;
c_2[1] = c_2_1;
c_2[2] = c_2_2;
c_2[3] = c_2_3;
c_2[4] = c_2_4;
c_3[0] = c_3_0;
c_3[1] = c_3_1;
c_3[2] = c_3_2;
c_3[3] = c_3_3;
c_3[4] = c_3_4;
}

View File

@ -3,12 +3,14 @@ TARGET = simple
$(TARGET).objdump: $(TARGET)
riscv64-unknown-elf-objdump -S -D $(TARGET) > $(TARGET).objdump
$(TARGET): $(TARGET).c
$(TARGET): $(TARGET).c Makefile
riscv64-unknown-elf-gcc -g -o $(TARGET) -march=rv64gc -mabi=lp64d -mcmodel=medany \
-O $(TARGET).c
# -O -T../../link/linkc.ld $(TARGET).c
# -nostartfiles -nostdlib $(TARGET).c
# -nostartfiles -nostdlib -T../../link/link.ld $(TARGET).c
-DPREALLOCATE=1 -mcmodel=medany -static -std=gnu99 -O2 -ffast-math -fno-common \
-fno-builtin-printf -fno-tree-loop-distribute-patterns \
-static -nostdlib -nostartfiles -lm -lgcc -T../common/test.ld \
-I../common \
-O0 $(TARGET).c \
../common/crt.S ../common/syscalls.c comp1.s
clean:
rm -f $(TARGET) $(TARGET).objdump

10
examples/C/simple/comp1.s Normal file
View File

@ -0,0 +1,10 @@
# _comp1 -- leaf routine callable from C as:  int _comp1(void);
# Takes no arguments, touches no memory, and hands the constant 42
# back to the caller in a0.
        .globl  _comp1
_comp1:
        li      a0, 42                  # a0 = 42 (value returned to the caller)
        ret                             # return through ra

BIN
examples/C/simple/simple Executable file

Binary file not shown.

View File

@ -2,13 +2,39 @@
// David_Harris@hmc.edu 24 December 2021
// Simple illustration of compiling C code
//#include <stdio.h>
#include "util.h"
extern int printf(const char* fmt, ...);
extern int _comp1(void);
/*
long sum(long N) {
long result, i;
result = 0;
for (i=1; i<=N; i++) result = result + i;
return result;
for (i=1; i<=N; i++) {
result = result + i;
}
return result;
int a;
asm volatile ("li s0, 10;");
asm volatile(
"li %0, 3;"
// "csrrs %0, 0xF14, zero" // CSRRS rd, mhartid, zero
: "=r"(a) //output
: //input
: //clobbered
);
return a;
}
*/
int main(void) {
return sum(4);
}
int s[1], expected[1];
// s[0] = sum(4);
s[0] = _comp1();
printf("s = %d\n", s[0]);
expected[0] = 42;
return verify(1, s, expected); // 0 means success
}

11009
examples/C/simple/simple.dis Normal file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,35 @@
// simple.C
// David_Harris@hmc.edu 24 December 2021
// Simple illustration of compiling C code
//#include <stdio.h>
#include "util.h"
extern int printf(const char* fmt, ...);
// sum: demonstration of GCC extended inline assembly.
// The earlier loop that added 1..N is kept below, commented out, for reference.
// The active code never reads N: it returns whatever the inline "li" instruction
// wrote into the local variable a, which is the constant 10.
long sum(long N) {
/* long result, i;
result = 0;
for (i=1; i<=N; i++) {
result = result + i;
}
return result; */
int a;
// asm volatile ("li s0, 10;");
asm volatile(
"li %0, 10"
// "csrrs %0, 0xF14, zero" // CSRRS rd, mhartid, zero (alternative instruction, disabled)
: "=r"(a) // output: %0 is a, write-only, in a register chosen by the compiler
: // input: none
: // clobbered: none declared
);
return a; // int result is converted to the long return type
}
// Program entry: call sum once, print what it returned, and compare that
// value with the expected constant. The exit status is whatever verify()
// reports; 0 means success.
int main(void) {
  int expected[1] = { 10 };
  int s[1];

  s[0] = sum(4);
  printf("s = %d\n", s[0]);
  return verify(1, s, expected); // 0 means success
}

27
examples/C/sum/Makefile Normal file
View File

@ -0,0 +1,27 @@
# Makefile for the sum example.
# Builds a bare-metal RV64GC executable from $(TARGET).c and then a
# source-annotated disassembly of it. The disassembly is the first rule,
# so it is what a plain `make` produces.
TARGET = sum

$(TARGET).objdump: $(TARGET)
	riscv64-unknown-elf-objdump -S $(TARGET) > $(TARGET).objdump

# Depending on Makefile forces a rebuild whenever the flags below change.
$(TARGET): $(TARGET).c Makefile
	riscv64-unknown-elf-gcc -o $(TARGET) -g -O \
	    -march=rv64gc -mabi=lp64d -mcmodel=medany \
	    -nostartfiles -T../common/test.ld -I../common \
	    $(TARGET).c ../common/crt.S ../common/syscalls.c

# Compiler flags:
# -o $(TARGET) defines the name of the output file
# -g generates debugging symbols for gdb
# -O turns on basic optimization
# -march=rv64gc -mabi=lp64d -mcmodel=medany generates code for RV64GC with doubles and long/ptrs = 64 bits
# -nostartfiles avoids inserting standard startup files because we are using crt.S
# -T specifies the linker file
# -I specifies the include path (e.g. for util.h)
# The last line defines the C files to compile.
# crt.S is needed as our startup file to initialize the processor
# syscalls.c implements printf through the HTIF for Spike
# other flags from riscv-tests makefiles that don't seem to be important
# -ffast-math -DPREALLOCATE=1 -std=gnu99 -fno-tree-loop-distribute-patterns
# -fno-common -static -fno-builtin-printf -nostdlib -lm -lgcc

# clean produces no file of its own; declaring it phony keeps `make clean`
# working even if a file named "clean" ever appears in this directory.
.PHONY: clean
clean:
	rm -f $(TARGET) $(TARGET).objdump

23
examples/C/sum/sum.c Normal file
View File

@ -0,0 +1,23 @@
// sum.c
// David_Harris@hmc.edu 24 December 2021
// Simple illustration of compiling C code
#include <stdio.h> // supports printf
#include "util.h" // supports verify
// sum: add the integers 1, 2, ..., N in ascending order and return the total.
// When N is less than 1 the loop body never runs and the result is 0.
long sum(long N) {
  long total = 0;
  long term = 1;

  while (term <= N) {
    total += term;
    term++;
  }
  return total;
}
// Program entry: compute sum(4), print it, and check it against the
// expected value 10. The exit status is the value verify() returns;
// 0 means success.
int main(void) {
  int expected[1] = { 10 };
  int s[1];

  s[0] = sum(4);            // long result stored into an int slot
  printf("s = %d\n", s[0]);
  return verify(1, s, expected); // 0 means success
}

View File

@ -1,17 +0,0 @@
/* Linker script: lays the output sections out one after another starting at
   0x80000000, moving up to the next 4 KiB boundary between groups.
   Statement order matters here because it fixes the order of sections in
   the image. */
OUTPUT_ARCH( "riscv" )
ENTRY(main) /* the entry point is the symbol main itself, not a startup routine */
SECTIONS
{
. = 0x80000000; /* start address of the image; NOTE(review): presumably the base of RAM, confirm against the memory map */
.text : { *(.text) } /* input sections named exactly .text, from every input file */
. = ALIGN(0x1000); /* advance the location counter to the next 0x1000 (4 KiB) boundary */
.tohost : { *(.tohost) } /* NOTE(review): presumably the host-interface mailbox used by the simulator; confirm */
. = ALIGN(0x1000);
.data : { *(.data) } /* initialized data */
.data.string : { *(.data.string)} /* placed directly after .data, with no extra alignment */
. = ALIGN(0x1000);
.bss : { *(.bss) } /* input .bss sections from every file */
_end = .; /* symbol holding the address just past .bss */
}

View File

@ -1475,8 +1475,8 @@ string imperas32f[] = '{
`WALLYTEST,
"rv64i_m/privilege/WALLY-MMU-SV39", "30A0",
"rv64i_m/privilege/WALLY-MMU-SV48", "30A0",
"rv64i_m/privilege/WALLY-PMP", "30A0"
// "rv64i_m/privilege/WALLY-PMA", "30A0",
"rv64i_m/privilege/WALLY-PMP", "30A0",
"rv64i_m/privilege/WALLY-PMA", "30A0"
};
string wally64periph[] = '{
@ -1491,8 +1491,8 @@ string wally32i[] = '{
string wally32priv[] = '{
`WALLYTEST,
"rv32i_m/privilege/WALLY-MMU-SV32", "3080",
"rv32i_m/privilege/WALLY-PMP", "3080"
// "rv32i_m/privilege/WALLY-PMA", "3080"
"rv32i_m/privilege/WALLY-PMP", "3080",
"rv32i_m/privilege/WALLY-PMA", "3080"
};
string wally32periph[] = '{

View File

@ -1,13 +1,9 @@
beef00b4
beef00b5
000000b6
000000b7
ffffffb7
00000001
00000bad
00000007
00000005
00000bad
beef00b9
00000002
00000007
00000005
00000bad
@ -22,15 +18,9 @@ beef00b9
00000007
00000005
00000bad
00000007
00000005
00000bad
000000bf
ffffffbf
00000001
00000bad
00000007
00000005
00000bad
beef00c1
00000007
00000005
@ -40,27 +30,37 @@ beef00c1
00000bad
00000001
00000bad
beef00c4
000000c5
000000c6
00000007
00000005
00000bad
00000001
00000bad
00000009
deadbeef
deadbeef
deadbeef
deadbeef
deadbeef
deadbeef
deadbeef
deadbeef
deadbeef
deadbeef
deadbeef
deadbeef
00000007
00000005
00000bad
00000001
00000bad
00000007
00000005
00000bad
00000001
00000bad
00000007
00000005
00000bad
00000001
00000bad
00000007
00000005
00000bad
00000001
00000bad
00000007
00000005
00000bad
00000001
00000bad
0000000b
deadbeef
deadbeef
deadbeef

View File

@ -21,6 +21,19 @@
// OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
///////////////////////////////////////////
#define BOOTROM_BASE 0x00001000
#define BOOTROM_RANGE 0x00000FFF
#define RAM_BASE 0x80000000
#define RAM_RANGE 0x7FFFFFFF
#define CLINT_BASE 0x02000000
#define CLINT_RANGE 0x0000FFFF
#define GPIO_BASE 0x10012000
#define GPIO_RANGE 0x000000FF
#define UART_BASE 0x10000000
#define UART_RANGE 0x00000007
#define PLIC_BASE 0x0C000000
#define PLIC_RANGE 0x03FFFFFF
#include "WALLY-TEST-LIB-32.S"
// Test library includes and handler for each type of test, a trap handler, imperas compliance instructions
// Ideally this should mean that a test can be written by simply adding .4byte statements as below.
@ -48,92 +61,92 @@
# | PLIC | 0xC000000 | 32-bit | YES | YES | NO | NO | NO | NO |
# | UART0 | 0x10000000 | 8-bit | YES | YES | NO | NO | NO | NO |
# | GPIO | 0x10012000 | 32-bit | YES | YES | NO | NO | NO | NO |
# | DRAM | 0x80000000 | Any | YES | YES | YES | YES | YES | YES |
# ************** Cacheable, Idempotent, Atomic tests are not implemented yet.
# ----------------- ROM ---------------------
# *** the rom is read only and these read tests depend on reading a known value out of memory.
# Is there some guaranteed value that I can read out of the ROM
# otherwise the read test can be modified to just check that the read happened,
# not necessarily that it got a known value out of memory. This feels hacky and Id be interested in other options.
# ROM goes untested because it isn't writeable and these tests rely on writing a known value to memory.
# .4byte 0x1000, 0xBEEF0001, 0x0 # 32-bit write: store access fault
# .4byte 0x1000, 0xBEEF0001, 0x1 # 32-bit read: success
# .4byte 0x1000, 0xBEEF0002, 0x12 # 16-bit write: store access fault
# .4byte 0x1000, 0xBEEF0002, 0x15 # 16-bit read: success
# .4byte 0x1000, 0xBEEF0003, 0x13 # 08-bit write: store access fault
# .4byte 0x1000, 0xBEEF0003, 0x16 # 08-bit read: success
# # *** similar problem with the execute tests. Impossible to write the needed executable code into rom once the program's running
# .4byte 0x1000, 0x111, 0x2 # execute: success
# ----------------- CLINT ---------------------
.4byte 0x2000000, 0xBEEF00B5, 0x0 # 32-bit write: success
.4byte 0x2000000, 0xBEEF00B5, 0x1 # 32-bit read: success
.4byte 0x2000000, 0xBEEF00B6, 0x12 # 16-bit write: success
.4byte 0x2000000, 0xBEEF00B6, 0x15 # 16-bit read: success
.4byte 0x2000000, 0xBEEF00B7, 0x13 # 08-bit write: success
.4byte 0x2000000, 0xBEEF00B7, 0x16 # 08-bit read: success
# Use timecmp register as readable and writable section of the CLINT
.4byte CLINT_BASE + 0x4000, 0xBEEF00B5, 0x0 # 32-bit write: success
.4byte CLINT_BASE + 0x4000, 0xBEEF00B5, 0x1 # 32-bit read: success
.4byte CLINT_BASE + 0x4000, 0xBEEF00B6, 0x12 # 16-bit write: success
.4byte CLINT_BASE + 0x4000, 0xBEEF00B6, 0x15 # 16-bit read: success
.4byte CLINT_BASE + 0x4000, 0xBEEF00B7, 0x13 # 08-bit write: success
.4byte CLINT_BASE + 0x4000, 0xBEEF00B7, 0x16 # 08-bit read: success
.4byte 0x2000000, 0xbad, 0x2 # execute: instruction access fault
.4byte CLINT_BASE, 0xbad, 0x2 # execute: instruction access fault
# ----------------- PLIC ---------------------
.4byte 0xC000000, 0xBEEF00B9, 0x0 # 32-bit write: success
.4byte 0xC000000, 0xBEEF00B9, 0x1 # 32-bit read: success
.4byte 0xC000000, 0xBEEF00BA, 0x12 # 16-bit write: store access fault
.4byte 0xC000000, 0xBEEF00BA, 0x15 # 16-bit read: load access fault
.4byte 0xC000000, 0xBEEF00BB, 0x13 # 08-bit write: store access fault
.4byte 0xC000000, 0xBEEF00BB, 0x16 # 08-bit read: load access fault
# Write 0x2 instead of wider value to plic address because the register width might change.
.4byte PLIC_BASE + 0x2000, 0x2, 0x0 # 32-bit write: success
.4byte PLIC_BASE + 0x2000, 0x2, 0x1 # 32-bit read: success
.4byte PLIC_BASE, 0xBEEF00BA, 0x12 # 16-bit write: store access fault
.4byte PLIC_BASE, 0xBEEF00BA, 0x15 # 16-bit read: load access fault
.4byte PLIC_BASE, 0xBEEF00BB, 0x13 # 08-bit write: store access fault
.4byte PLIC_BASE, 0xBEEF00BB, 0x16 # 08-bit read: load access fault
.4byte 0xC000000, 0xbad, 0x2 # execute: instruction access fault
.4byte PLIC_BASE, 0xbad, 0x2 # execute: instruction access fault
# ----------------- UART0 ---------------------
.4byte 0x10000000, 0xBEEF00BD, 0x0 # 32-bit write: store access fault
.4byte 0x10000000, 0xBEEF00BD, 0x1 # 32-bit read: load access fault
.4byte 0x10000000, 0xBEEF00BE, 0x12 # 16-bit write: store access fault
.4byte 0x10000000, 0xBEEF00BE, 0x15 # 16-bit read: load access fault
.4byte 0x10000000, 0xBEEF00BF, 0x13 # 08-bit write: success
.4byte 0x10000000, 0xBEEF00BF, 0x16 # 08-bit read: success
.4byte UART_BASE, 0xBEEF00BD, 0x0 # 32-bit write: store access fault
.4byte UART_BASE, 0xBEEF00BD, 0x1 # 32-bit read: load access fault
.4byte UART_BASE, 0xBEEF00BE, 0x12 # 16-bit write: store access fault
.4byte UART_BASE, 0xBEEF00BE, 0x15 # 16-bit read: load access fault
# Different address for this test so that we write into a writable register in the uart.
.4byte UART_BASE + 0x3, 0xBEEF00BF, 0x13 # 08-bit write: success
.4byte UART_BASE + 0x3, 0xBEEF00BF, 0x16 # 08-bit read: success
.4byte 0x10000000, 0xbad, 0x2 # execute: instruction access fault
.4byte UART_BASE, 0xbad, 0x2 # execute: instruction access fault
# ----------------- GPIO ---------------------
.4byte 0x1012000, 0xBEEF00C1, 0x0 # 32-bit write: success
.4byte 0x1012000, 0xBEEF00C1, 0x1 # 32-bit read: success
.4byte 0x1012000, 0xBEEF00C2, 0x12 # 16-bit write: store access fault
.4byte 0x1012000, 0xBEEF00C2, 0x15 # 16-bit read: load access fault
.4byte 0x1012000, 0xBEEF00C3, 0x13 # 08-bit write: store access fault
.4byte 0x1012000, 0xBEEF00C3, 0x16 # 08-bit read: load access fault
.4byte GPIO_BASE + 0x8, 0xBEEF00C1, 0x0 # 32-bit write: success
.4byte GPIO_BASE + 0x8, 0xBEEF00C1, 0x1 # 32-bit read: success
.4byte GPIO_BASE, 0xBEEF00C2, 0x12 # 16-bit write: store access fault
.4byte GPIO_BASE, 0xBEEF00C2, 0x15 # 16-bit read: load access fault
.4byte GPIO_BASE, 0xBEEF00C3, 0x13 # 08-bit write: store access fault
.4byte GPIO_BASE, 0xBEEF00C3, 0x16 # 08-bit read: load access fault
.4byte 0x1012000, 0xbad, 0x2 # execute: instruction access fault
.4byte GPIO_BASE, 0xbad, 0x2 # execute: instruction access fault
# ----------------- DRAM ---------------------
# the following is already tested by the fact that this test runs without error:
# 32 bit reads and writes into DRAM,
# Execution in DRAM
# offset by 0xf000 to avoid overwriting the program
.4byte 0x8000F000, 0xBEEF00C5, 0x12 # 16-bit write: success
.4byte 0x8000F000, 0xBEEF00C5, 0x15 # 16-bit read: success
.4byte 0x8000F000, 0xBEEF00C6, 0x13 # 08-bit write: success
.4byte 0x8000F000, 0xBEEF00C6, 0x16 # 08-bit read: success
# ----------------- Inaccessible ---------------------
# show that load, store, and jalr cause faults in a region not defined by PMAs.
# *** should I go through every possible inaccessible region of memory or is one just fine?
.4byte 0xD000000, 0xBEEF00C7, 0x0 # 32-bit write: store access fault
.4byte 0xD000000, 0xBEEF00C7, 0x1 # 32-bit read: load access fault
.4byte 0x1000, 0x111, 0x2 # execute: instruction access fault
# Tests 'random' place in unimplemented memory
.4byte 0x40000000, 0xBEEF00C7, 0x0 # 32-bit write: store access fault
.4byte 0x40000000, 0xBEEF00C7, 0x1 # 32-bit read: load access fault
.4byte 0x40000000, 0x111, 0x2 # execute: instruction access fault
.4byte 0x0, 0x0, 0x3 // terminate tests
# Tests just past the end of each peripheral
.4byte (BOOTROM_BASE+BOOTROM_RANGE+1), 0xBEEF00C8, 0x0 # 32-bit write: store access fault
.4byte (BOOTROM_BASE+BOOTROM_RANGE+1), 0xBEEF00C8, 0x1 # 32-bit read: load access fault
.4byte (BOOTROM_BASE+BOOTROM_RANGE+1), 0x111, 0x2 # execute: instruction access fault
.4byte (CLINT_BASE+CLINT_RANGE+1), 0xBEEF00C9, 0x0 # 32-bit write: store access fault
.4byte (CLINT_BASE+CLINT_RANGE+1), 0xBEEF00C9, 0x1 # 32-bit read: load access fault
.4byte (CLINT_BASE+CLINT_RANGE+1), 0x111, 0x2 # execute: instruction access fault
.4byte (PLIC_BASE+PLIC_RANGE+1), 0xBEEF00CA, 0x0 # 32-bit write: store access fault
.4byte (PLIC_BASE+PLIC_RANGE+1), 0xBEEF00CA, 0x1 # 32-bit read: load access fault
.4byte (PLIC_BASE+PLIC_RANGE+1), 0x111, 0x2 # execute: instruction access fault
.4byte (UART_BASE+UART_RANGE+1), 0xBEEF00CB, 0x13 # 08-bit write: store access fault
.4byte (UART_BASE+UART_RANGE+1), 0xBEEF00CB, 0x16 # 08-bit read: load access fault
.4byte (UART_BASE+UART_RANGE+1), 0x111, 0x2 # execute: instruction access fault
.4byte (GPIO_BASE+GPIO_RANGE+1), 0xBEEF00CC, 0x0 # 32-bit write: store access fault
.4byte (GPIO_BASE+GPIO_RANGE+1), 0xBEEF00CC, 0x1 # 32-bit read: load access fault
.4byte (GPIO_BASE+GPIO_RANGE+1), 0x111, 0x2 # execute: instruction access fault
.4byte 0x0, 0x0, 0x3 # terminate tests

View File

@ -1,11 +1,11 @@
beef00b4
0000dead
beef00b5
00000000
ffffffff
000000b6
00000000
000000b7
00000000
ffffffb7
ffffffff
00000001
00000000
00000bad
@ -16,7 +16,7 @@ beef00b5
00000000
00000bad
00000000
beef00b9
00000002
00000000
00000007
00000000
@ -52,8 +52,8 @@ beef00b9
00000000
00000bad
00000000
000000bf
00000000
ffffffbf
ffffffff
00000001
00000000
00000bad
@ -65,7 +65,7 @@ beef00b9
00000bad
00000000
beef00c1
00000000
ffffffff
00000007
00000000
00000005

View File

@ -21,7 +21,7 @@
// OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
///////////////////////////////////////////
#define BOOTROM_BASE 0x00001000 // spec had been 0x1000 to 0x2FFF, but dh truncated to 0x1000 to 0x1FFF because upper half seems to be all zeros and this is easier for decoder
#define BOOTROM_BASE 0x00001000
#define BOOTROM_RANGE 0x00000FFF
#define RAM_BASE 0x80000000
#define RAM_RANGE 0x7FFFFFFF
@ -70,88 +70,90 @@
# ----------------- CLINT ---------------------
.8byte CLINT_BASE, 0xEEEEEEEEEEEEEEEE, 0x0 # 64-bit write: success
.8byte CLINT_BASE, 0x0000DEADBEEF00B4, 0x1 # 64-bit read: success
.8byte CLINT_BASE, 0x0000DEADBEEF00B5, 0x11 # 32-bit write: success
.8byte CLINT_BASE, 0x0000DEADBEEF00B5, 0x14 # 32-bit read: success
.8byte CLINT_BASE, 0x0000DEADBEEF00B6, 0x12 # 16-bit write: success
.8byte CLINT_BASE, 0x0000DEADBEEF00B6, 0x15 # 16-bit read: success
.8byte CLINT_BASE, 0x0000DEADBEEF00B7, 0x13 # 08-bit write: success
.8byte CLINT_BASE, 0x0000DEADBEEF00B7, 0x16 # 08-bit read: success
# Use timecmp register as readable and writable section of the CLINT
.8byte CLINT_BASE + 0x4000, 0x0000DEADBEEF00B4, 0x0 # 64-bit write: success
.8byte CLINT_BASE + 0x4000, 0x0000DEADBEEF00B4, 0x1 # 64-bit read: success
.8byte CLINT_BASE + 0x4000, 0x0000DEADBEEF00B5, 0x11 # 32-bit write: success
.8byte CLINT_BASE + 0x4000, 0x0000DEADBEEF00B5, 0x14 # 32-bit read: success
.8byte CLINT_BASE + 0x4000, 0x0000DEADBEEF00B6, 0x12 # 16-bit write: success
.8byte CLINT_BASE + 0x4000, 0x0000DEADBEEF00B6, 0x15 # 16-bit read: success
.8byte CLINT_BASE + 0x4000, 0x0000DEADBEEF00B7, 0x13 # 08-bit write: success
.8byte CLINT_BASE + 0x4000, 0x0000DEADBEEF00B7, 0x16 # 08-bit read: success
.8byte CLINT_BASE, 0xbad, 0x2 # execute: instruction access fault
.8byte CLINT_BASE, 0xbad, 0x2 # execute: instruction access fault
# ----------------- PLIC ---------------------
.8byte PLIC_BASE, 0x0000DEADBEEF00B8, 0x0 # 64-bit write: store access fault
.8byte PLIC_BASE, 0x0000DEADBEEF00B8, 0x1 # 64-bit read: load access fault
.8byte PLIC_BASE, 0x0000DEADBEEF00B9, 0x11 # 32-bit write: success
.8byte PLIC_BASE, 0x0000DEADBEEF00B9, 0x14 # 32-bit read: success
.8byte PLIC_BASE, 0x0000DEADBEEF00BA, 0x12 # 16-bit write: store access fault
.8byte PLIC_BASE, 0x0000DEADBEEF00BA, 0x15 # 16-bit read: load access fault
.8byte PLIC_BASE, 0x0000DEADBEEF00BB, 0x13 # 08-bit write: store access fault
.8byte PLIC_BASE, 0x0000DEADBEEF00BB, 0x16 # 08-bit read: load access fault
.8byte PLIC_BASE, 0x0000DEADBEEF00B8, 0x0 # 64-bit write: store access fault
.8byte PLIC_BASE, 0x0000DEADBEEF00B8, 0x1 # 64-bit read: load access fault
# Write 0x2 instead of wider value to plic address because the register width might change.
.8byte PLIC_BASE + 0x2000, 0x2, 0x11 # 32-bit write: success
.8byte PLIC_BASE + 0x2000, 0x2, 0x14 # 32-bit read: success
.8byte PLIC_BASE, 0x0000DEADBEEF00BA, 0x12 # 16-bit write: store access fault
.8byte PLIC_BASE, 0x0000DEADBEEF00BA, 0x15 # 16-bit read: load access fault
.8byte PLIC_BASE, 0x0000DEADBEEF00BB, 0x13 # 08-bit write: store access fault
.8byte PLIC_BASE, 0x0000DEADBEEF00BB, 0x16 # 08-bit read: load access fault
.8byte PLIC_BASE, 0xbad, 0x2 # execute: instruction access fault
.8byte PLIC_BASE, 0xbad, 0x2 # execute: instruction access fault
# ----------------- UART0 ---------------------
.8byte UART_BASE, 0x0000DEADBEEF00BC, 0x0 # 64-bit write: store access fault
.8byte UART_BASE, 0x0000DEADBEEF00BC, 0x1 # 64-bit read: load access fault
.8byte UART_BASE, 0x0000DEADBEEF00BD, 0x11 # 32-bit write: store access fault
.8byte UART_BASE, 0x0000DEADBEEF00BD, 0x14 # 32-bit read: load access fault
.8byte UART_BASE, 0x0000DEADBEEF00BE, 0x12 # 16-bit write: store access fault
.8byte UART_BASE, 0x0000DEADBEEF00BE, 0x15 # 16-bit read: load access fault
.8byte UART_BASE, 0x0000DEADBEEF00BF, 0x13 # 08-bit write: success
.8byte UART_BASE, 0x0000DEADBEEF00BF, 0x16 # 08-bit read: success
.8byte UART_BASE, 0x0000DEADBEEF00BC, 0x0 # 64-bit write: store access fault
.8byte UART_BASE, 0x0000DEADBEEF00BC, 0x1 # 64-bit read: load access fault
.8byte UART_BASE, 0x0000DEADBEEF00BD, 0x11 # 32-bit write: store access fault
.8byte UART_BASE, 0x0000DEADBEEF00BD, 0x14 # 32-bit read: load access fault
.8byte UART_BASE, 0x0000DEADBEEF00BE, 0x12 # 16-bit write: store access fault
.8byte UART_BASE, 0x0000DEADBEEF00BE, 0x15 # 16-bit read: load access fault
# Different address for this test so that we write into a writable register in the uart.
.8byte UART_BASE + 0x3, 0x0000DEADBEEF00BF, 0x13 # 08-bit write: success
.8byte UART_BASE + 0x3, 0x0000DEADBEEF00BF, 0x16 # 08-bit read: success
.8byte UART_BASE, 0xbad, 0x2 # execute: instruction access fault
.8byte UART_BASE, 0xbad, 0x2 # execute: instruction access fault
# ----------------- GPIO ---------------------
.8byte GPIO_BASE, 0x0000DEADBEEF00C0, 0x0 # 64-bit write: store access fault
.8byte GPIO_BASE, 0x0000DEADBEEF00C0, 0x1 # 64-bit read: load access fault
.8byte GPIO_BASE, 0x0000DEADBEEF00C1, 0x11 # 32-bit write: success
.8byte GPIO_BASE, 0x0000DEADBEEF00C1, 0x14 # 32-bit read: success
.8byte GPIO_BASE, 0x0000DEADBEEF00C2, 0x12 # 16-bit write: store access fault
.8byte GPIO_BASE, 0x0000DEADBEEF00C2, 0x15 # 16-bit read: load access fault
.8byte GPIO_BASE, 0x0000DEADBEEF00C3, 0x13 # 08-bit write: store access fault
.8byte GPIO_BASE, 0x0000DEADBEEF00C3, 0x16 # 08-bit read: load access fault
.8byte GPIO_BASE, 0x0000DEADBEEF00C0, 0x0 # 64-bit write: store access fault
.8byte GPIO_BASE, 0x0000DEADBEEF00C0, 0x1 # 64-bit read: load access fault
.8byte GPIO_BASE + 0x8, 0x0000DEADBEEF00C1, 0x11 # 32-bit write: success
.8byte GPIO_BASE + 0x8, 0x0000DEADBEEF00C1, 0x14 # 32-bit read: success
.8byte GPIO_BASE, 0x0000DEADBEEF00C2, 0x12 # 16-bit write: store access fault
.8byte GPIO_BASE, 0x0000DEADBEEF00C2, 0x15 # 16-bit read: load access fault
.8byte GPIO_BASE, 0x0000DEADBEEF00C3, 0x13 # 08-bit write: store access fault
.8byte GPIO_BASE, 0x0000DEADBEEF00C3, 0x16 # 08-bit read: load access fault
.8byte GPIO_BASE, 0xbad, 0x2 # execute: instruction access fault
.8byte GPIO_BASE, 0xbad, 0x2 # execute: instruction access fault
# ----------------- Inaccessible ---------------------
# show that load, store, and jalr cause faults in a region not defined by PMAs.
# *** should I go through every possible inaccessible region of memory or is one just fine?
# show that load, store, and jalr cause faults in regions not defined by PMAs.
# Tests 'random' place in unimplemented memory
.8byte 0xD000000, 0x0000DEADBEEF00C7, 0x0 # 64-bit write: store access fault
.8byte 0xD000000, 0x0000DEADBEEF00C7, 0x1 # 64-bit read: load access fault
.8byte 0xD000000, 0x111, 0x2 # execute: instruction access fault
.8byte 0x40000000, 0x0000DEADBEEF00C7, 0x0 # 64-bit write: store access fault
.8byte 0x40000000, 0x0000DEADBEEF00C7, 0x1 # 64-bit read: load access fault
.8byte 0x40000000, 0x111, 0x2 # execute: instruction access fault
# Tests just past the end of each peripheral
.8byte (BOOTROM_BASE+BOOTROM_RANGE+1), 0x0000DEADBEEF00C7, 0x0 # 64-bit write: store access fault
.8byte (BOOTROM_BASE+BOOTROM_RANGE+1), 0x0000DEADBEEF00C7, 0x1 # 64-bit read: load access fault
.8byte (BOOTROM_BASE+BOOTROM_RANGE+1), 0x0000DEADBEEF00C8, 0x0 # 64-bit write: store access fault
.8byte (BOOTROM_BASE+BOOTROM_RANGE+1), 0x0000DEADBEEF00C8, 0x1 # 64-bit read: load access fault
.8byte (BOOTROM_BASE+BOOTROM_RANGE+1), 0x111, 0x2 # execute: instruction access fault
.8byte (CLINT_BASE+CLINT_RANGE+1), 0x0000DEADBEEF00C7, 0x0 # 64-bit write: store access fault
.8byte (CLINT_BASE+CLINT_RANGE+1), 0x0000DEADBEEF00C7, 0x1 # 64-bit read: load access fault
.8byte (CLINT_BASE+CLINT_RANGE+1), 0x0000DEADBEEF00C9, 0x0 # 64-bit write: store access fault
.8byte (CLINT_BASE+CLINT_RANGE+1), 0x0000DEADBEEF00C9, 0x1 # 64-bit read: load access fault
.8byte (CLINT_BASE+CLINT_RANGE+1), 0x111, 0x2 # execute: instruction access fault
.8byte (PLIC_BASE+PLIC_RANGE+1), 0x0000DEADBEEF00C7, 0x11 # 32-bit write: store access fault
.8byte (PLIC_BASE+PLIC_RANGE+1), 0x0000DEADBEEF00C7, 0x14 # 32-bit read: load access fault
.8byte (PLIC_BASE+PLIC_RANGE+1), 0x0000DEADBEEF00CA, 0x11 # 32-bit write: store access fault
.8byte (PLIC_BASE+PLIC_RANGE+1), 0x0000DEADBEEF00CA, 0x14 # 32-bit read: load access fault
.8byte (PLIC_BASE+PLIC_RANGE+1), 0x111, 0x2 # execute: instruction access fault
.8byte (UART_BASE+UART_RANGE+1), 0x0000DEADBEEF00C7, 0x13 # 08-bit write: store access fault
.8byte (UART_BASE+UART_RANGE+1), 0x0000DEADBEEF00C7, 0x16 # 08-bit read: load access fault
.8byte (UART_BASE+UART_RANGE+1), 0x0000DEADBEEF00CB, 0x13 # 08-bit write: store access fault
.8byte (UART_BASE+UART_RANGE+1), 0x0000DEADBEEF00CB, 0x16 # 08-bit read: load access fault
.8byte (UART_BASE+UART_RANGE+1), 0x111, 0x2 # execute: instruction access fault
.8byte (GPIO_BASE+GPIO_RANGE+1), 0x0000DEADBEEF00C7, 0x11 # 32-bit write: store access fault
.8byte (GPIO_BASE+GPIO_RANGE+1), 0x0000DEADBEEF00C7, 0x14 # 32-bit read: load access fault
.8byte (GPIO_BASE+GPIO_RANGE+1), 0x0000DEADBEEF00CC, 0x11 # 32-bit write: store access fault
.8byte (GPIO_BASE+GPIO_RANGE+1), 0x0000DEADBEEF00CC, 0x14 # 32-bit read: load access fault
.8byte (GPIO_BASE+GPIO_RANGE+1), 0x111, 0x2 # execute: instruction access fault
.8byte 0x0, 0x0, 0x3 // terminate tests
.8byte 0x0, 0x0, 0x3 # terminate tests