Merge branch 'main' into dcache

2025-02-11 06:05:49 +00:00 · 2021-07-15 11:55:20 -05:00 · 2021-07-15 11:55:20 -05:00 · 4549a9f1c9
commit 4549a9f1c9
parent 5fb5ac3d5a 701ea38964
185 changed files with 28127 additions and 10721 deletions
--- a/.gitignore
+++ b/.gitignore
@ -28,5 +28,12 @@ wally-pipelined/linux-testgen/nohup*
 wally-pipelined/linux-testgen/x*
 !wally-pipelined/linux-testgen/linux-testvectors/tvCopier.py
 !wally-pipelined/linux-testgen/linux-testvectors/tvLinker.sh
+!wally-pipelined/linux-testgen/linux-testvectors/tvUnlinker.sh
+!wally-pipelined/linux-testgen/linux-testvectors/intermediate-outputs
+wally-pipelined/linux-testgen/linux-testvectors/intermediate-outputs/*
+!wally-pipelined/linux-testgen/linux-testvectors/intermediate-outputs/git_create_dir.txt
+wally-pipelined/linux-testgen/buildroot/
+wally-pipelined/linux-testgen/buildroot-image-output
+wally-pipelined/linux-testgen/buildroot-config-src/main.config.old
 wally-pipelined/regression/slack-notifier/slack-webhook-url.txt

--- a/riscv-coremark/coremark/README.md
+++ b/riscv-coremark/coremark/README.md
@ -3,6 +3,8 @@

 CoreMark's primary goals are simplicity and providing a method for testing only a processor's core features. For more information about EEMBC's comprehensive embedded benchmark suites, please see www.eembc.org.

+For a more compute-intensive version of CoreMark that uses larger datasets and execution loops taken from common applications, please check out EEMBC's [CoreMark-PRO](https://www.github.com/eembc/coremark-pro) benchmark, also on GitHub.
+
 # Building and Running
 	
 To build and run the benchmark, type 
@ -83,7 +85,9 @@ Use `XCFLAGS=-DMULTITHREAD=N` where N is number of threads to run in parallel. S
 % make XCFLAGS="-DMULTITHREAD=4 -DUSE_PTHREAD"
 ~~~

-Above will compile the benchmark for execution on 4 cores, using POSIX Threads API.
+The above will compile the benchmark for execution on 4 cores, using POSIX Threads API.
+
+Note: linking may fail on the previous command if your linker does not automatically add the `pthread` library. If you encounter `undefined reference` errors, please modify the `core_portme.mak` file for your platform (e.g. `linux/core_portme.mak`) and add `-lpthread` to the `LFLAGS_END` parameter.

 # Run Parameters for the Benchmark Executable
 CoreMark's executable takes several parameters as follows (but only if `main()` accepts arguments):
@ -109,7 +113,7 @@ The default for such a target when testing different configurations could be:

 # Submitting Results

-CoreMark results can be submitted on the web. Open a web browser and go to https://www.eembc.org/coremark/login.php?url=enter_score.php. After registering an account you may enter a score.
+CoreMark results can be submitted on the web. Open a web browser and go to the [submission page](https://www.eembc.org/coremark/submit.php). After registering an account you may enter a score.

 # Run Rules
 What is and is not allowed.
--- a/riscv-coremark/coremark/barebones/core_portme.c
+++ b/riscv-coremark/coremark/barebones/core_portme.c
@ -19,110 +19,135 @@ Original Author: Shay Gal-on
 #include "core_portme.h"

 #if VALIDATION_RUN
-	volatile ee_s32 seed1_volatile=0x3415;
-	volatile ee_s32 seed2_volatile=0x3415;
-	volatile ee_s32 seed3_volatile=0x66;
+volatile ee_s32 seed1_volatile = 0x3415;
+volatile ee_s32 seed2_volatile = 0x3415;
+volatile ee_s32 seed3_volatile = 0x66;
 #endif
 #if PERFORMANCE_RUN
-	volatile ee_s32 seed1_volatile=0x0;
-	volatile ee_s32 seed2_volatile=0x0;
-	volatile ee_s32 seed3_volatile=0x66;
+volatile ee_s32 seed1_volatile = 0x0;
+volatile ee_s32 seed2_volatile = 0x0;
+volatile ee_s32 seed3_volatile = 0x66;
 #endif
 #if PROFILE_RUN
-	volatile ee_s32 seed1_volatile=0x8;
-	volatile ee_s32 seed2_volatile=0x8;
-	volatile ee_s32 seed3_volatile=0x8;
+volatile ee_s32 seed1_volatile = 0x8;
+volatile ee_s32 seed2_volatile = 0x8;
+volatile ee_s32 seed3_volatile = 0x8;
 #endif
-	volatile ee_s32 seed4_volatile=ITERATIONS;
-	volatile ee_s32 seed5_volatile=0;
+volatile ee_s32 seed4_volatile = ITERATIONS;
+volatile ee_s32 seed5_volatile = 0;
 /* Porting : Timing functions
-	How to capture time and convert to seconds must be ported to whatever is supported by the platform.
-	e.g. Read value from on board RTC, read value from cpu clock cycles performance counter etc. 
-	Sample implementation for standard time.h and windows.h definitions included.
+        How to capture time and convert to seconds must be ported to whatever is
+   supported by the platform. e.g. Read value from on board RTC, read value from
+   cpu clock cycles performance counter etc. Sample implementation for standard
+   time.h and windows.h definitions included.
 */
-CORETIMETYPE barebones_clock() {
-	#error "You must implement a method to measure time in barebones_clock()! This function should return current time.\n"
+CORETIMETYPE
+barebones_clock()
+{
+#error \
+    "You must implement a method to measure time in barebones_clock()! This function should return current time.\n"
 }
 /* Define : TIMER_RES_DIVIDER
-	Divider to trade off timer resolution and total time that can be measured.
+        Divider to trade off timer resolution and total time that can be
+   measured.

-	Use lower values to increase resolution, but make sure that overflow does not occur.
-	If there are issues with the return value overflowing, increase this value.
-	*/
-#define GETMYTIME(_t) (*_t=barebones_clock())
-#define MYTIMEDIFF(fin,ini) ((fin)-(ini))
-#define TIMER_RES_DIVIDER 1
+        Use lower values to increase resolution, but make sure that overflow
+   does not occur. If there are issues with the return value overflowing,
+   increase this value.
+        */
+#define GETMYTIME(_t)              (*_t = barebones_clock())
+#define MYTIMEDIFF(fin, ini)       ((fin) - (ini))
+#define TIMER_RES_DIVIDER          1
 #define SAMPLE_TIME_IMPLEMENTATION 1
-#define EE_TICKS_PER_SEC (CLOCKS_PER_SEC / TIMER_RES_DIVIDER)
+#define EE_TICKS_PER_SEC           (CLOCKS_PER_SEC / TIMER_RES_DIVIDER)

 /** Define Host specific (POSIX), or target specific global time variables. */
 static CORETIMETYPE start_time_val, stop_time_val;

 /* Function : start_time
-	This function will be called right before starting the timed portion of the benchmark.
+        This function will be called right before starting the timed portion of
+   the benchmark.

-	Implementation may be capturing a system timer (as implemented in the example code) 
-	or zeroing some system parameters - e.g. setting the cpu clocks cycles to 0.
+        Implementation may be capturing a system timer (as implemented in the
+   example code) or zeroing some system parameters - e.g. setting the cpu clocks
+   cycles to 0.
 */
-void start_time(void) {
-	GETMYTIME(&start_time_val );      
+void
+start_time(void)
+{
+    GETMYTIME(&start_time_val);
 }
 /* Function : stop_time
-	This function will be called right after ending the timed portion of the benchmark.
+        This function will be called right after ending the timed portion of the
+   benchmark.

-	Implementation may be capturing a system timer (as implemented in the example code) 
-	or other system parameters - e.g. reading the current value of cpu cycles counter.
+        Implementation may be capturing a system timer (as implemented in the
+   example code) or other system parameters - e.g. reading the current value of
+   cpu cycles counter.
 */
-void stop_time(void) {
-	GETMYTIME(&stop_time_val );      
+void
+stop_time(void)
+{
+    GETMYTIME(&stop_time_val);
 }
 /* Function : get_time
-	Return an abstract "ticks" number that signifies time on the system.
-	
-	Actual value returned may be cpu cycles, milliseconds or any other value,
-	as long as it can be converted to seconds by <time_in_secs>.
-	This methodology is taken to accomodate any hardware or simulated platform.
-	The sample implementation returns millisecs by default, 
-	and the resolution is controlled by <TIMER_RES_DIVIDER>
+        Return an abstract "ticks" number that signifies time on the system.
+
+        Actual value returned may be cpu cycles, milliseconds or any other
+   value, as long as it can be converted to seconds by <time_in_secs>. This
+   methodology is taken to accommodate any hardware or simulated platform. The
+   sample implementation returns millisecs by default, and the resolution is
+   controlled by <TIMER_RES_DIVIDER>
 */
-CORE_TICKS get_time(void) {
-	CORE_TICKS elapsed=(CORE_TICKS)(MYTIMEDIFF(stop_time_val, start_time_val));
-	return elapsed;
+CORE_TICKS
+get_time(void)
+{
+    CORE_TICKS elapsed
+        = (CORE_TICKS)(MYTIMEDIFF(stop_time_val, start_time_val));
+    return elapsed;
 }
 /* Function : time_in_secs
-	Convert the value returned by get_time to seconds.
+        Convert the value returned by get_time to seconds.

-	The <secs_ret> type is used to accomodate systems with no support for floating point.
-	Default implementation implemented by the EE_TICKS_PER_SEC macro above.
+        The <secs_ret> type is used to accommodate systems with no support for
+   floating point. Default implementation implemented by the EE_TICKS_PER_SEC
+   macro above.
 */
-secs_ret time_in_secs(CORE_TICKS ticks) {
-	secs_ret retval=((secs_ret)ticks) / (secs_ret)EE_TICKS_PER_SEC;
-	return retval;
+secs_ret
+time_in_secs(CORE_TICKS ticks)
+{
+    secs_ret retval = ((secs_ret)ticks) / (secs_ret)EE_TICKS_PER_SEC;
+    return retval;
 }

-ee_u32 default_num_contexts=1;
+ee_u32 default_num_contexts = 1;

 /* Function : portable_init
-	Target specific initialization code 
-	Test for some common mistakes.
+        Target specific initialization code
+        Test for some common mistakes.
 */
-void portable_init(core_portable *p, int *argc, char *argv[])
+void
+portable_init(core_portable *p, int *argc, char *argv[])
 {
-	#error "Call board initialization routines in portable init (if needed), in particular initialize UART!\n"
-	if (sizeof(ee_ptr_int) != sizeof(ee_u8 *)) {
-		ee_printf("ERROR! Please define ee_ptr_int to a type that holds a pointer!\n");
-	}
-	if (sizeof(ee_u32) != 4) {
-		ee_printf("ERROR! Please define ee_u32 to a 32b unsigned type!\n");
-	}
-	p->portable_id=1;
+#error \
+    "Call board initialization routines in portable init (if needed), in particular initialize UART!\n"
+    if (sizeof(ee_ptr_int) != sizeof(ee_u8 *))
+    {
+        ee_printf(
+            "ERROR! Please define ee_ptr_int to a type that holds a "
+            "pointer!\n");
+    }
+    if (sizeof(ee_u32) != 4)
+    {
+        ee_printf("ERROR! Please define ee_u32 to a 32b unsigned type!\n");
+    }
+    p->portable_id = 1;
 }
 /* Function : portable_fini
-	Target specific final code 
+        Target specific final code
 */
-void portable_fini(core_portable *p)
+void
+portable_fini(core_portable *p)
 {
-	p->portable_id=0;
+    p->portable_id = 0;
 }
-
-
--- a/riscv-coremark/coremark/barebones/core_portme.h
+++ b/riscv-coremark/coremark/barebones/core_portme.h
@ -16,178 +16,189 @@ limitations under the License.
 Original Author: Shay Gal-on
 */
 /* Topic : Description
-	This file contains configuration constants required to execute on different platforms
+        This file contains configuration constants required to execute on
+   different platforms
 */
 #ifndef CORE_PORTME_H
 #define CORE_PORTME_H
 /************************/
 /* Data types and settings */
 /************************/
-/* Configuration : HAS_FLOAT 
-	Define to 1 if the platform supports floating point.
+/* Configuration : HAS_FLOAT
+        Define to 1 if the platform supports floating point.
 */
-#ifndef HAS_FLOAT 
+#ifndef HAS_FLOAT
 #define HAS_FLOAT 1
 #endif
 /* Configuration : HAS_TIME_H
-	Define to 1 if platform has the time.h header file,
-	and implementation of functions thereof.
+        Define to 1 if platform has the time.h header file,
+        and implementation of functions thereof.
 */
 #ifndef HAS_TIME_H
 #define HAS_TIME_H 1
 #endif
 /* Configuration : USE_CLOCK
-	Define to 1 if platform has the time.h header file,
-	and implementation of functions thereof.
+        Define to 1 if platform has the time.h header file,
+        and implementation of functions thereof.
 */
 #ifndef USE_CLOCK
 #define USE_CLOCK 1
 #endif
 /* Configuration : HAS_STDIO
-	Define to 1 if the platform has stdio.h.
+        Define to 1 if the platform has stdio.h.
 */
 #ifndef HAS_STDIO
 #define HAS_STDIO 0
 #endif
 /* Configuration : HAS_PRINTF
-	Define to 1 if the platform has stdio.h and implements the printf function.
+        Define to 1 if the platform has stdio.h and implements the printf
+   function.
 */
 #ifndef HAS_PRINTF
 #define HAS_PRINTF 0
 #endif

-
 /* Definitions : COMPILER_VERSION, COMPILER_FLAGS, MEM_LOCATION
-	Initialize these strings per platform
+        Initialize these strings per platform
 */
-#ifndef COMPILER_VERSION 
- #ifdef __GNUC__
- #define COMPILER_VERSION "GCC"__VERSION__
- #else
- #define COMPILER_VERSION "Please put compiler version here (e.g. gcc 4.1)"
- #endif
+#ifndef COMPILER_VERSION
+#ifdef __GNUC__
+#define COMPILER_VERSION "GCC"__VERSION__
+#else
+#define COMPILER_VERSION "Please put compiler version here (e.g. gcc 4.1)"
 #endif
-#ifndef COMPILER_FLAGS 
- #define COMPILER_FLAGS FLAGS_STR /* "Please put compiler flags here (e.g. -o3)" */
 #endif
-#ifndef MEM_LOCATION 
- #define MEM_LOCATION "STACK"
+#ifndef COMPILER_FLAGS
+#define COMPILER_FLAGS \
+    FLAGS_STR /* "Please put compiler flags here (e.g. -o3)" */
+#endif
+#ifndef MEM_LOCATION
+#define MEM_LOCATION "STACK"
 #endif

 /* Data Types :
-	To avoid compiler issues, define the data types that need ot be used for 8b, 16b and 32b in <core_portme.h>.
-	
-	*Imprtant* :
-	ee_ptr_int needs to be the data type used to hold pointers, otherwise coremark may fail!!!
+        To avoid compiler issues, define the data types that need to be used for
+   8b, 16b and 32b in <core_portme.h>.
+
+        *Important* :
+        ee_ptr_int needs to be the data type used to hold pointers, otherwise
+   coremark may fail!!!
 */
-typedef signed short ee_s16;
+typedef signed short   ee_s16;
 typedef unsigned short ee_u16;
-typedef signed int ee_s32;
-typedef double ee_f32;
-typedef unsigned char ee_u8;
-typedef unsigned int ee_u32;
-typedef ee_u32 ee_ptr_int;
-typedef size_t ee_size_t;
+typedef signed int     ee_s32;
+typedef double         ee_f32;
+typedef unsigned char  ee_u8;
+typedef unsigned int   ee_u32;
+typedef ee_u32         ee_ptr_int;
+typedef size_t         ee_size_t;
 #define NULL ((void *)0)
 /* align_mem :
-	This macro is used to align an offset to point to a 32b value. It is used in the Matrix algorithm to initialize the input memory blocks.
+        This macro is used to align an offset to point to a 32b value. It is
+   used in the Matrix algorithm to initialize the input memory blocks.
 */
-#define align_mem(x) (void *)(4 + (((ee_ptr_int)(x) - 1) & ~3))
+#define align_mem(x) (void *)(4 + (((ee_ptr_int)(x)-1) & ~3))

 /* Configuration : CORE_TICKS
-	Define type of return from the timing functions.
+        Define type of return from the timing functions.
 */
-#define CORETIMETYPE ee_u32 
+#define CORETIMETYPE ee_u32
 typedef ee_u32 CORE_TICKS;

 /* Configuration : SEED_METHOD
-	Defines method to get seed values that cannot be computed at compile time.
-	
-	Valid values :
-	SEED_ARG - from command line.
-	SEED_FUNC - from a system function.
-	SEED_VOLATILE - from volatile variables.
+        Defines method to get seed values that cannot be computed at compile
+   time.
+
+        Valid values :
+        SEED_ARG - from command line.
+        SEED_FUNC - from a system function.
+        SEED_VOLATILE - from volatile variables.
 */
 #ifndef SEED_METHOD
 #define SEED_METHOD SEED_VOLATILE
 #endif

 /* Configuration : MEM_METHOD
-	Defines method to get a block of memry.
-	
-	Valid values :
-	MEM_MALLOC - for platforms that implement malloc and have malloc.h.
-	MEM_STATIC - to use a static memory array.
-	MEM_STACK - to allocate the data block on the stack (NYI).
+        Defines method to get a block of memory.
+
+        Valid values :
+        MEM_MALLOC - for platforms that implement malloc and have malloc.h.
+        MEM_STATIC - to use a static memory array.
+        MEM_STACK - to allocate the data block on the stack (NYI).
 */
 #ifndef MEM_METHOD
 #define MEM_METHOD MEM_STACK
 #endif

 /* Configuration : MULTITHREAD
-	Define for parallel execution 
-	
-	Valid values :
-	1 - only one context (default).
-	N>1 - will execute N copies in parallel.
-	
-	Note : 
-	If this flag is defined to more then 1, an implementation for launching parallel contexts must be defined.
-	
-	Two sample implementations are provided. Use <USE_PTHREAD> or <USE_FORK> to enable them.
-	
-	It is valid to have a different implementation of <core_start_parallel> and <core_end_parallel> in <core_portme.c>,
-	to fit a particular architecture. 
+        Define for parallel execution
+
+        Valid values :
+        1 - only one context (default).
+        N>1 - will execute N copies in parallel.
+
+        Note :
+        If this flag is defined to more than 1, an implementation for launching
+   parallel contexts must be defined.
+
+        Two sample implementations are provided. Use <USE_PTHREAD> or <USE_FORK>
+   to enable them.
+
+        It is valid to have a different implementation of <core_start_parallel>
+   and <core_end_parallel> in <core_portme.c>, to fit a particular architecture.
 */
 #ifndef MULTITHREAD
 #define MULTITHREAD 1
 #define USE_PTHREAD 0
-#define USE_FORK 0
-#define USE_SOCKET 0
+#define USE_FORK    0
+#define USE_SOCKET  0
 #endif

 /* Configuration : MAIN_HAS_NOARGC
-	Needed if platform does not support getting arguments to main. 
-	
-	Valid values :
-	0 - argc/argv to main is supported
-	1 - argc/argv to main is not supported
-	
-	Note : 
-	This flag only matters if MULTITHREAD has been defined to a value greater then 1.
+        Needed if platform does not support getting arguments to main.
+
+        Valid values :
+        0 - argc/argv to main is supported
+        1 - argc/argv to main is not supported
+
+        Note :
+        This flag only matters if MULTITHREAD has been defined to a value
+   greater than 1.
 */
-#ifndef MAIN_HAS_NOARGC 
+#ifndef MAIN_HAS_NOARGC
 #define MAIN_HAS_NOARGC 0
 #endif

 /* Configuration : MAIN_HAS_NORETURN
-	Needed if platform does not support returning a value from main. 
-	
-	Valid values :
-	0 - main returns an int, and return value will be 0.
-	1 - platform does not support returning a value from main
+        Needed if platform does not support returning a value from main.
+
+        Valid values :
+        0 - main returns an int, and return value will be 0.
+        1 - platform does not support returning a value from main
 */
 #ifndef MAIN_HAS_NORETURN
 #define MAIN_HAS_NORETURN 0
 #endif

 /* Variable : default_num_contexts
-	Not used for this simple port, must cintain the value 1.
+        Not used for this simple port, must contain the value 1.
 */
 extern ee_u32 default_num_contexts;

-typedef struct CORE_PORTABLE_S {
-	ee_u8	portable_id;
+typedef struct CORE_PORTABLE_S
+{
+    ee_u8 portable_id;
 } core_portable;

 /* target specific init/fini */
 void portable_init(core_portable *p, int *argc, char *argv[]);
 void portable_fini(core_portable *p);

-#if !defined(PROFILE_RUN) && !defined(PERFORMANCE_RUN) && !defined(VALIDATION_RUN)
-#if (TOTAL_DATA_SIZE==1200)
+#if !defined(PROFILE_RUN) && !defined(PERFORMANCE_RUN) \
+    && !defined(VALIDATION_RUN)
+#if (TOTAL_DATA_SIZE == 1200)
 #define PROFILE_RUN 1
-#elif (TOTAL_DATA_SIZE==2000)
+#elif (TOTAL_DATA_SIZE == 2000)
 #define PERFORMANCE_RUN 1
 #else
 #define VALIDATION_RUN 1
--- a/riscv-coremark/coremark/barebones/cvt.c
+++ b/riscv-coremark/coremark/barebones/cvt.c
@ -17,101 +17,111 @@ limitations under the License.
 #define CVTBUFSIZE 80
 static char CVTBUF[CVTBUFSIZE];

-static char *cvt(double arg, int ndigits, int *decpt, int *sign, char *buf, int eflag)
+static char *
+cvt(double arg, int ndigits, int *decpt, int *sign, char *buf, int eflag)
 {
-  int r2;
-  double fi, fj;
-  char *p, *p1;
+    int    r2;
+    double fi, fj;
+    char * p, *p1;

-  if (ndigits < 0) ndigits = 0;
-  if (ndigits >= CVTBUFSIZE - 1) ndigits = CVTBUFSIZE - 2;
-  r2 = 0;
-  *sign = 0;
-  p = &buf[0];
-  if (arg < 0)
-  {
-    *sign = 1;
-    arg = -arg;
-  }
-  arg = modf(arg, &fi);
-  p1 = &buf[CVTBUFSIZE];
+    if (ndigits < 0)
+        ndigits = 0;
+    if (ndigits >= CVTBUFSIZE - 1)
+        ndigits = CVTBUFSIZE - 2;
+    r2    = 0;
+    *sign = 0;
+    p     = &buf[0];
+    if (arg < 0)
+    {
+        *sign = 1;
+        arg   = -arg;
+    }
+    arg = modf(arg, &fi);
+    p1  = &buf[CVTBUFSIZE];

-  if (fi != 0) 
-  {
-    p1 = &buf[CVTBUFSIZE];
-    while (fi != 0) 
+    if (fi != 0)
    {
-      fj = modf(fi / 10, &fi);
-      *--p1 = (int)((fj + .03) * 10) + '0';
-      r2++;
+        p1 = &buf[CVTBUFSIZE];
+        while (fi != 0)
+        {
+            fj    = modf(fi / 10, &fi);
+            *--p1 = (int)((fj + .03) * 10) + '0';
+            r2++;
+        }
+        while (p1 < &buf[CVTBUFSIZE])
+            *p++ = *p1++;
    }
-    while (p1 < &buf[CVTBUFSIZE]) *p++ = *p1++;
-  } 
-  else if (arg > 0)
-  {
-    while ((fj = arg * 10) < 1) 
+    else if (arg > 0)
    {
-      arg = fj;
-      r2--;
+        while ((fj = arg * 10) < 1)
+        {
+            arg = fj;
+            r2--;
+        }
    }
-  }
-  p1 = &buf[ndigits];
-  if (eflag == 0) p1 += r2;
-  *decpt = r2;
-  if (p1 < &buf[0]) 
-  {
-    buf[0] = '\0';
+    p1 = &buf[ndigits];
+    if (eflag == 0)
+        p1 += r2;
+    *decpt = r2;
+    if (p1 < &buf[0])
+    {
+        buf[0] = '\0';
+        return buf;
+    }
+    while (p <= p1 && p < &buf[CVTBUFSIZE])
+    {
+        arg *= 10;
+        arg  = modf(arg, &fj);
+        *p++ = (int)fj + '0';
+    }
+    if (p1 >= &buf[CVTBUFSIZE])
+    {
+        buf[CVTBUFSIZE - 1] = '\0';
+        return buf;
+    }
+    p = p1;
+    *p1 += 5;
+    while (*p1 > '9')
+    {
+        *p1 = '0';
+        if (p1 > buf)
+            ++*--p1;
+        else
+        {
+            *p1 = '1';
+            (*decpt)++;
+            if (eflag == 0)
+            {
+                if (p > buf)
+                    *p = '0';
+                p++;
+            }
+        }
+    }
+    *p = '\0';
    return buf;
-  }
-  while (p <= p1 && p < &buf[CVTBUFSIZE])
-  {
-    arg *= 10;
-    arg = modf(arg, &fj);
-    *p++ = (int) fj + '0';
-  }
-  if (p1 >= &buf[CVTBUFSIZE]) 
-  {
-    buf[CVTBUFSIZE - 1] = '\0';
-    return buf;
-  }
-  p = p1;
-  *p1 += 5;
-  while (*p1 > '9') 
-  {
-    *p1 = '0';
-    if (p1 > buf)
-      ++*--p1;
-    else 
-    {
-      *p1 = '1';
-      (*decpt)++;
-      if (eflag == 0) 
-      {
-        if (p > buf) *p = '0';
-        p++;
-      }
-    }
-  }
-  *p = '\0';
-  return buf;
 }

-char *ecvt(double arg, int ndigits, int *decpt, int *sign)
+char *
+ecvt(double arg, int ndigits, int *decpt, int *sign)
 {
-  return cvt(arg, ndigits, decpt, sign, CVTBUF, 1);
+    return cvt(arg, ndigits, decpt, sign, CVTBUF, 1);
 }

-char *ecvtbuf(double arg, int ndigits, int *decpt, int *sign, char *buf)
+char *
+ecvtbuf(double arg, int ndigits, int *decpt, int *sign, char *buf)
 {
-  return cvt(arg, ndigits, decpt, sign, buf, 1);
+    return cvt(arg, ndigits, decpt, sign, buf, 1);
 }

-char *fcvt(double arg, int ndigits, int *decpt, int *sign)
+char *
+fcvt(double arg, int ndigits, int *decpt, int *sign)
 {
-  return cvt(arg, ndigits, decpt, sign, CVTBUF, 0);
+    return cvt(arg, ndigits, decpt, sign, CVTBUF, 0);
 }

-char *fcvtbuf(double arg, int ndigits, int *decpt, int *sign, char *buf)
+char *
+fcvtbuf(double arg, int ndigits, int *decpt, int *sign, char *buf)
 {
-  return cvt(arg, ndigits, decpt, sign, buf, 0);
+    return cvt(arg, ndigits, decpt, sign, buf, 0);
 }
--- a/riscv-coremark/coremark/barebones/ee_printf.c
+++ b/riscv-coremark/coremark/barebones/ee_printf.c
--- a/riscv-coremark/coremark/core_list_join.c
+++ b/riscv-coremark/coremark/core_list_join.c
@ -17,8 +17,8 @@ Original Author: Shay Gal-on
 */

 #include "coremark.h"
-#include <stdlib.h>
-#include <string.h>
+//#include <stdlib.h>
+//#include <string.h>
 /*
 Topic: Description
 	Benchmark using a linked list.
@ -118,7 +118,7 @@ ee_s32 cmp_idx(list_data *a, list_data *b, core_results *res) {
 	return a->idx - b->idx;
 }

-void ehitoa(int value, char *str, int base){
+/*void ehitoa(int value, char *str, int base){
 	if (value>100000) strcpy(str,"too big");
 	else{
 		int places[6] = {100000, 10000, 1000, 100, 10, 1};
@ -135,7 +135,7 @@ void ehitoa(int value, char *str, int base){
 		}
 		str[6]=0;
 	}
-}
+}*/

 void copy_info(list_data *to,list_data *from) {
 	to->data16=from->data16;
@ -158,22 +158,22 @@ ee_u16 core_bench_list(core_results *res, ee_s16 finder_idx) {
 	list_head *finder, *remover;
 	list_data info;
 	ee_s16 i;
-	ee_printf("entered corebenchlist \n");
+	//ee_printf("entered corebenchlist \n");
 	info.idx=finder_idx;
 	/* find <find_num> values in the list, and change the list each time (reverse and cache if value found) */
 	for (i=0; i<find_num; i++) {
-		ee_printf("for loop \n");
+		//ee_printf("for loop \n");
 		info.data16= (i & 0xff) ;
 		this_find=core_list_find(list,&info);
 		list=core_list_reverse(list);
 		if (this_find==NULL) {
 			missed++;
 			retval+=(list->next->info->data16 >> 8) & 1;
-			ee_printf("if statement \n");
+			//ee_printf("if statement \n");
 		}
 		else {
 			found++;
-			ee_printf("else statement \n");
+			//ee_printf("else statement \n");
 			if (this_find->info->data16 & 0x1) /* use found value */
 				retval+=(this_find->info->data16 >> 9) & 1;
 			/* and cache next item at the head of the list (if any) */
@ -187,7 +187,7 @@ ee_u16 core_bench_list(core_results *res, ee_s16 finder_idx) {
 		if (info.idx>=0)
 			info.idx++;
 #if CORE_DEBUG
-	ee_printf("List find %d: [%d,%d,%d]\n",i,retval,missed,found);
+	//ee_printf("List find %d: [%d,%d,%d]\n",i,retval,missed,found);
 #endif
 	}
 	retval+=found*4-missed;
@ -204,7 +204,7 @@ ee_u16 core_bench_list(core_results *res, ee_s16 finder_idx) {
 		finder=finder->next;
 	}
 #if CORE_DEBUG
-	ee_printf("List sort 1: %04x\n",retval);
+	//ee_printf("List sort 1: %04x\n",retval);
 #endif
 	remover=core_list_undo_remove(remover,list->next);
 	/* sort the list by index, in effect returning the list to original state */
@ -216,7 +216,7 @@ ee_u16 core_bench_list(core_results *res, ee_s16 finder_idx) {
 		finder=finder->next;
 	}
 #if CORE_DEBUG
-	ee_printf("List sort 2: %04x\n",retval);
+	//ee_printf("List sort 2: %04x\n",retval);
 #endif
 	return retval;
 }
@ -235,26 +235,26 @@ ee_u16 core_bench_list(core_results *res, ee_s16 finder_idx) {
 */
 list_head *core_list_init(ee_u32 blksize, list_head *memblock, ee_s16 seed) {
 	/* calculated pointers for the list */
-	ee_printf("%d \n blksize", blksize);
+	//ee_printf("%d \n blksize", blksize);
 	ee_u32 per_item=16+sizeof(struct list_data_s);
-	ee_printf("%d \n sizeof", sizeof(struct list_data_s));
-	ee_printf("%d \n  per_item", per_item);
+	//ee_printf("%d \n sizeof", sizeof(struct list_data_s));
+	//ee_printf("%d \n  per_item", per_item);
 	ee_u32 size=(blksize/per_item)-2; 
-	char bufftwo[200];
-	ehitoa(size, bufftwo, 10);
-	ee_printf(" size = %s done \n", bufftwo);
-	ee_printf("%d", size);/* to accomodate systems with 64b pointers, and make sure same code is executed, set max list elements */
+	//char bufftwo[200];
+	//ehitoa(size, bufftwo, 10);
+	//ee_printf(" size = %s done \n", bufftwo);
+	//ee_printf("%d", size);/* to accomodate systems with 64b pointers, and make sure same code is executed, set max list elements */
 	list_head *memblock_end=memblock+size;

 	list_data *datablock=(list_data *)(memblock_end);
 	list_data *datablock_end=datablock+size;
-	ee_printf("datablock_end");
+	//ee_printf("datablock_end");
 	/* some useful variables */
 	ee_u32 i;
 	list_head *finder,*list=memblock;
 	list_data info;
-	ehitoa(size, bufftwo, 10);
-	ee_printf(" size2 = %s done \n", bufftwo);
+	//ehitoa(size, bufftwo, 10);
+	//ee_printf(" size2 = %s done \n", bufftwo);

 	/* create a fake items for the list head and tail */
 	list->next=NULL;
@ -265,58 +265,58 @@ list_head *core_list_init(ee_u32 blksize, list_head *memblock, ee_s16 seed) {
 	datablock++;
 	info.idx=0x7fff;
 	info.data16=(ee_s16)0xffff;
-	ehitoa(size, bufftwo, 10);
-	ee_printf(" size3 = %s done \n", bufftwo);
+	//ehitoa(size, bufftwo, 10);
+	//ee_printf(" size3 = %s done \n", bufftwo);
 	core_list_insert_new(list,&info,&memblock,&datablock,memblock_end,datablock_end);
-	ehitoa(size, bufftwo, 10);
-	ee_printf(" size4 = %s done \n", bufftwo);; 
+	//ehitoa(size, bufftwo, 10);
+	//ee_printf(" size4 = %s done \n", bufftwo);; 
 	/* then insert size items */
 	for (i=0; i<size; i++) {
 		ee_u16 datpat=((ee_u16)(seed^i) & 0xf);
 		ee_u16 dat=(datpat<<3) | (i&0x7); /* alternate between algorithms */
 		info.data16=(dat<<8) | dat;		/* fill the data with actual data and upper bits with rebuild value */
 		core_list_insert_new(list,&info,&memblock,&datablock,memblock_end,datablock_end);
-		ehitoa(i, bufftwo, 10);
-		ee_printf(" i = %s done \n", bufftwo);
+		//ehitoa(i, bufftwo, 10);
+		//ee_printf(" i = %s done \n", bufftwo);
 		//ee_printf("%d \n", i);
 		/*char grow[200];
 		char growtwo[200];
 		itoa(i, growtwo, 10);
 		sprintf(grow, "test %u buff2 %s goodbyeadd \n", i, growtwo);*/
 	}
-	ee_printf("exited for \n");
+	//ee_printf("exited for \n");
 	/* and now index the list so we know initial seed order of the list */
 	finder=list->next;
 	i=1;
-	ehitoa(i, bufftwo, 10);
-	ee_printf(" i = %s done \n", bufftwo);
+	//ehitoa(i, bufftwo, 10);
+	//ee_printf(" i = %s done \n", bufftwo);
 	while (finder->next!=NULL) {
-		ee_printf("enter while statement \n");
+		//ee_printf("enter while statement \n");
 		if (i<size/5){ /* first 20% of the list in order */
 			finder->info->idx=i++;
-			ehitoa(i, bufftwo, 10);
-			ee_printf("  if i = %s done \n", bufftwo);
+			//ehitoa(i, bufftwo, 10);
+			//ee_printf("  if i = %s done \n", bufftwo);
 		}
 		
 		else { 
 			ee_u16 pat=(ee_u16)(i++ ^ seed); /* get a pseudo random number */
 			finder->info->idx=0x3fff & (((i & 0x07) << 8) | pat); /* make sure the mixed items end up after the ones in sequence */
-			ehitoa(i, bufftwo, 10);
-			ee_printf("  else i = %s done \n", bufftwo);
+			//ehitoa(i, bufftwo, 10);
+			//ee_printf("  else i = %s done \n", bufftwo);
 		}
 		finder=finder->next;
 	}
-	ehitoa(i, bufftwo, 10);
-	ee_printf(" i2 = %s done \n", bufftwo);
+	//ehitoa(i, bufftwo, 10);
+	//ee_printf(" i2 = %s done \n", bufftwo);
 	list = core_list_mergesort(list,cmp_idx,NULL);
 #if CORE_DEBUG
-	ee_printf("Initialized list:\n");
+	//ee_printf("Initialized list:\n");
 	finder=list;
 	while (finder) {
-		ee_printf("[%04x,%04x]",finder->info->idx,(ee_u16)finder->info->data16);
+		//ee_printf("[%04x,%04x]",finder->info->idx,(ee_u16)finder->info->data16);
 		finder=finder->next;
 	}
-	ee_printf("\n");
+	//ee_printf("\n");
 #endif
 	return list;
 }
@ -424,20 +424,22 @@ list_head *core_list_undo_remove(list_head *item_removed, list_head *item_modifi
 	Found item, or NULL if not found.
 */
 list_head *core_list_find(list_head *list,list_data *info) {
-	ee_printf("entered core_list_find \n");
+	//ee_printf("entered core_list_find \n");
 	if (info->idx>=0) {
-		ee_printf("find if \n");
+		//ee_printf("find if \n");
 		while (list && (list->info->idx != info->idx)){
 			list=list->next;
-			ee_printf("find while if \n");}
-		ee_printf("core_list_find end \n");
+			//ee_printf("find while if \n");
+			}
+		//ee_printf("core_list_find end \n");
 		return list;
 	} else {
-		ee_printf("find else");
+		//ee_printf("find else");
 		while (list && ((list->info->data16 & 0xff) != info->data16)){
 			list=list->next;
-			ee_printf("find while else \n");}
-		ee_printf("core list find end \n");
+			//ee_printf("find while else \n");
+			}
+		//ee_printf("core list find end \n");
 		return list;
 	}
 }
@ -456,7 +458,7 @@ list_head *core_list_find(list_head *list,list_data *info) {
 */

 list_head *core_list_reverse(list_head *list) {
-	ee_printf("entered core_list_reverse");
+//	ee_printf("entered core_list_reverse");
 	list_head *next=NULL, *tmp;
 	while (list) {
 		tmp=list->next;
@ -464,7 +466,7 @@ list_head *core_list_reverse(list_head *list) {
 		next=list;
 		list=tmp;
 	}
-	ee_printf("core_list_reverse done");
+	//ee_printf("core_list_reverse done");
 	return next;
 }
 /* Function: core_list_mergesort
@ -493,27 +495,27 @@ list_head *core_list_mergesort(list_head *list, list_cmp cmp, core_results *res)
    ee_s32 insize, nmerges, psize, qsize, i;

    insize = 1;
-	char bufftwo[200];
+	//char bufftwo[200];
    while (1) {
        p = list;
        list = NULL;
        tail = NULL;

        nmerges = 0;  /* count number of merges we do in this pass */
-		ehitoa(nmerges, bufftwo, 10);
-		ee_printf(" nmerges default value = %s done \n", bufftwo);
+		//ehitoa(nmerges, bufftwo, 10);
+		//ee_printf(" nmerges default value = %s done \n", bufftwo);
        while (p) {
            nmerges++;  /* there exists a merge to be done */
-			ehitoa(nmerges, bufftwo, 10);
-			ee_printf(" current nmerges = %s done \n", bufftwo);
+			//ehitoa(nmerges, bufftwo, 10);
+			//ee_printf(" current nmerges = %s done \n", bufftwo);
            /* step `insize' places along from p */
            q = p;
            psize = 0;
-			ehitoa(insize, bufftwo, 10);
-			ee_printf(" insize = %s done \n", bufftwo);
+			//ehitoa(insize, bufftwo, 10);
+			//ee_printf(" insize = %s done \n", bufftwo);
            for (i = 0; i < insize; i++) {
-				ehitoa(i, bufftwo, 10);
-				ee_printf(" i = %s done \n", bufftwo);
+				//ehitoa(i, bufftwo, 10);
+				//ee_printf(" i = %s done \n", bufftwo);
                psize++;
 			    q = q->next;
                if (!q) break;
@ -521,37 +523,37 @@ list_head *core_list_mergesort(list_head *list, list_cmp cmp, core_results *res)

            /* if q hasn't fallen off end, we have two lists to merge */
            qsize = insize;
-			ehitoa(qsize, bufftwo, 10);
-			ee_printf(" qsize = %s done \n", bufftwo);
+			//ehitoa(qsize, bufftwo, 10);
+			//ee_printf(" qsize = %s done \n", bufftwo);

            /* now we have two lists; merge them */
            while (psize > 0 || (qsize > 0 && q)) {

 				/* decide whether next element of merge comes from p or q */
 				if (psize == 0) {
-					ee_printf("if \n");
+					//ee_printf("if \n");
 				    /* p is empty; e must come from q. */
 				    e = q; q = q->next; qsize--;
 				} else if (qsize == 0 || !q) {
-					ee_printf("else if \n");
+					//ee_printf("else if \n");
 				    /* q is empty; e must come from p. */
 				    e = p; p = p->next; psize--;
 				} else if (cmp(p->info,q->info,res) <= 0) {
-					ee_printf("else if 2 \n");
+					//ee_printf("else if 2 \n");
 				    /* First element of p is lower (or same); e must come from p. */
 				    e = p; p = p->next; psize--;
 				} else {
-					ee_printf("else \n");
+					//ee_printf("else \n");
 				    /* First element of q is lower; e must come from q. */
 				    e = q; q = q->next; qsize--;
 				}

 		        /* add the next element to the merged list */
 				if (tail) {
-					ee_printf("tail if \n");
+					//ee_printf("tail if \n");
 				    tail->next = e;
 				} else {
-					ee_printf("tail else \n");
+					//ee_printf("tail else \n");
 				    list = e;
 				}
 				tail = e;
@ -569,8 +571,8 @@ list_head *core_list_mergesort(list_head *list, list_cmp cmp, core_results *res)

        /* Otherwise repeat, merging lists twice the size */
        insize *= 2;
-		ehitoa(insize, bufftwo, 10);
-		ee_printf(" insize2 = %s done \n", bufftwo);
+		//ehitoa(insize, bufftwo, 10);
+		//ee_printf(" insize2 = %s done \n", bufftwo);
    }
 #if COMPILER_REQUIRES_SORT_RETURN
 	return list;
--- a/riscv-coremark/coremark/core_main.c
+++ b/riscv-coremark/coremark/core_main.c
@ -17,396 +17,431 @@ Original Author: Shay Gal-on
 */

 /* File: core_main.c
-	This file contains the framework to acquire a block of memory, seed initial parameters, tun t he benchmark and report the results.
+        This file contains the framework to acquire a block of memory, seed
+   initial parameters, run the benchmark and report the results.
 */
 #include "coremark.h"

 /* Function: iterate
-	Run the benchmark for a specified number of iterations.
+        Run the benchmark for a specified number of iterations.

-	Operation:
-	For each type of benchmarked algorithm:
-		a - Initialize the data block for the algorithm.
-		b - Execute the algorithm N times.
+        Operation:
+        For each type of benchmarked algorithm:
+                a - Initialize the data block for the algorithm.
+                b - Execute the algorithm N times.

-	Returns:
-	NULL.
+        Returns:
+        NULL.
 */
-static ee_u16 list_known_crc[]   =      {(ee_u16)0xd4b0,(ee_u16)0x3340,(ee_u16)0x6a79,(ee_u16)0xe714,(ee_u16)0xe3c1};
-static ee_u16 matrix_known_crc[] =      {(ee_u16)0xbe52,(ee_u16)0x1199,(ee_u16)0x5608,(ee_u16)0x1fd7,(ee_u16)0x0747};
-static ee_u16 state_known_crc[]  =      {(ee_u16)0x5e47,(ee_u16)0x39bf,(ee_u16)0xe5a4,(ee_u16)0x8e3a,(ee_u16)0x8d84};
-int gg_printf(const char *fmt, ...);
-int sendstring(const char *p);
-void _send_char(char c);
-void *iterate(void *pres) {
-	ee_u32 i;
-	ee_u16 crc;
-	core_results *res=(core_results *)pres;
-	ee_u32 iterations=res->iterations;
-	res->crc=0;
-	res->crclist=0;
-	res->crcmatrix=0;
-	res->crcstate=0;
+static ee_u16 list_known_crc[]   = { (ee_u16)0xd4b0,
+                                   (ee_u16)0x3340,
+                                   (ee_u16)0x6a79,
+                                   (ee_u16)0xe714,
+                                   (ee_u16)0xe3c1 };
+static ee_u16 matrix_known_crc[] = { (ee_u16)0xbe52,
+                                     (ee_u16)0x1199,
+                                     (ee_u16)0x5608,
+                                     (ee_u16)0x1fd7,
+                                     (ee_u16)0x0747 };
+static ee_u16 state_known_crc[]  = { (ee_u16)0x5e47,
+                                    (ee_u16)0x39bf,
+                                    (ee_u16)0xe5a4,
+                                    (ee_u16)0x8e3a,
+                                    (ee_u16)0x8d84 };
+void *
+iterate(void *pres)
+{
+    ee_u32        i;
+    ee_u16        crc;
+    core_results *res        = (core_results *)pres;
+    ee_u32        iterations = res->iterations;
+    res->crc                 = 0;
+    res->crclist             = 0;
+    res->crcmatrix           = 0;
+    res->crcstate            = 0;

-	for (i=0; i<iterations; i++) {
-		crc=core_bench_list(res,1);
-		res->crc=crcu16(crc,res->crc);
-		crc=core_bench_list(res,-1);
-		res->crc=crcu16(crc,res->crc);
-		if (i==0) res->crclist=res->crc;
-	}
-	return NULL;
+    for (i = 0; i < iterations; i++)
+    {
+        crc      = core_bench_list(res, 1);
+        res->crc = crcu16(crc, res->crc);
+        crc      = core_bench_list(res, -1);
+        res->crc = crcu16(crc, res->crc);
+        if (i == 0)
+            res->crclist = res->crc;
+    }
+    return NULL;
 }

-#if (SEED_METHOD==SEED_ARG)
+#if (SEED_METHOD == SEED_ARG)
 ee_s32 get_seed_args(int i, int argc, char *argv[]);
-#define get_seed(x) (ee_s16)get_seed_args(x,argc,argv)
-#define get_seed_32(x) get_seed_args(x,argc,argv)
+#define get_seed(x)    (ee_s16) get_seed_args(x, argc, argv)
+#define get_seed_32(x) get_seed_args(x, argc, argv)
 #else /* via function or volatile */
 ee_s32 get_seed_32(int i);
-#define get_seed(x) (ee_s16)get_seed_32(x)
+#define get_seed(x) (ee_s16) get_seed_32(x)
 #endif

-#if (MEM_METHOD==MEM_STATIC)
+#if (MEM_METHOD == MEM_STATIC)
 ee_u8 static_memblk[TOTAL_DATA_SIZE];
 #endif
-char *mem_name[3] = {"Static","Heap","Stack"};
+char *mem_name[3] = { "Static", "Heap", "Stack" };
 /* Function: main
-	Main entry routine for the benchmark.
-	This function is responsible for the following steps:
+        Main entry routine for the benchmark.
+        This function is responsible for the following steps:

-	1 - Initialize input seeds from a source that cannot be determined at compile time.
-	2 - Initialize memory block for use.
-	3 - Run and time the benchmark.
-	4 - Report results, testing the validity of the output if the seeds are known.
+        1 - Initialize input seeds from a source that cannot be determined at compile time.
+        2 - Initialize memory block for use.
+        3 - Run and time the benchmark.
+        4 - Report results, testing the validity of the output if the seeds are known.

-	Arguments:
-	1 - first seed  : Any value
-	2 - second seed : Must be identical to first for iterations to be identical
-	3 - third seed  : Any value, should be at least an order of magnitude less then the input size, but bigger then 32.
-	4 - Iterations  : Special, if set to 0, iterations will be automatically determined such that the benchmark will run between 10 to 100 secs
+        Arguments:
+        1 - first seed  : Any value
+        2 - second seed : Must be identical to first for iterations to be identical
+        3 - third seed  : Any value, should be at least an order of magnitude less
+                          than the input size, but bigger than 32.
+        4 - Iterations  : Special, if set to 0, iterations will be automatically
+                          determined such that the benchmark will run between 10 to 100 secs

 */

 #if MAIN_HAS_NOARGC
-MAIN_RETURN_TYPE main(void) {
-	int argc=0;
-	char *argv[1];
+MAIN_RETURN_TYPE
+main(void)
+{
+    int   argc = 0;
+    char *argv[1];
 #else
-MAIN_RETURN_TYPE main(int argc, char *argv[]) {
+MAIN_RETURN_TYPE
+main(int argc, char *argv[])
+{
 #endif
-	//const char s[] = "Elizabeth";
-	//ee_printf("eeprint");
-	//ee_printf("Trying to print: %d", 0);
-	/*gg_printf("Elizabeth");*/
-	//sendstring("Elizabeth");
-
-	//sendstring(s);
-	//return(0);
-	ee_u16 i,j=0,num_algorithms=0;
-	ee_s16 known_id=-1,total_errors=0;
-	ee_u16 seedcrc=0;
-	CORE_TICKS total_time;
-	core_results results[MULTITHREAD];
-#if (MEM_METHOD==MEM_STACK)
-	ee_u8 stack_memblock[TOTAL_DATA_SIZE*MULTITHREAD];
+    ee_printf("SHOWTIME\n");
+    ee_u16       i, j = 0, num_algorithms = 0;
+    ee_s16       known_id = -1, total_errors = 0;
+    ee_u16       seedcrc = 0;
+    CORE_TICKS   total_time;
+    core_results results[MULTITHREAD];
+#if (MEM_METHOD == MEM_STACK)
+    ee_u8 stack_memblock[TOTAL_DATA_SIZE * MULTITHREAD];
 #endif
-	/* first call any initializations needed */
-	portable_init(&(results[0].port), &argc, argv);
-	/* First some checks to make sure benchmark will run ok */
-	if (sizeof(struct list_head_s)>128) {
-		ee_printf("list_head structure too big for comparable data!\n");
-		return MAIN_RETURN_VAL;
-	}
-	results[0].seed1=get_seed(1);
-	results[0].seed2=get_seed(2);
-	results[0].seed3=get_seed(3);
-	results[0].iterations=get_seed_32(4);
+    /* first call any initializations needed */
+    portable_init(&(results[0].port), &argc, argv);
+    /* First some checks to make sure benchmark will run ok */
+    if (sizeof(struct list_head_s) > 128)
+    {
+        ee_printf("list_head structure too big for comparable data!\n");
+        return MAIN_RETURN_VAL;
+    }
+    results[0].seed1      = get_seed(1);
+    results[0].seed2      = get_seed(2);
+    results[0].seed3      = get_seed(3);
+    results[0].iterations = get_seed_32(4);
 #if CORE_DEBUG
-	results[0].iterations=1;
+    results[0].iterations = 1;
 #endif
-	results[0].execs=get_seed_32(5);
-	if (results[0].execs==0) { /* if not supplied, execute all algorithms */
-		results[0].execs=ALL_ALGORITHMS_MASK;
-	}
-		/* put in some default values based on one seed only for easy testing */
-	if ((results[0].seed1==0) && (results[0].seed2==0) && (results[0].seed3==0)) { /* validation run */
-		results[0].seed1=0;
-		results[0].seed2=0;
-		results[0].seed3=0x66;
-	}
-	if ((results[0].seed1==1) && (results[0].seed2==0) && (results[0].seed3==0)) { /* perfromance run */
-		results[0].seed1=0x3415;
-		results[0].seed2=0x3415;
-		results[0].seed3=0x66;
-	}
-#if (MEM_METHOD==MEM_STATIC)
-	results[0].memblock[0]=(void *)static_memblk;
-	results[0].size=TOTAL_DATA_SIZE;
-	ee_printf("%d \n total data size", TOTAL_DATA_SIZE);
-	results[0].err=0;
-	#if (MULTITHREAD>1)
-	#error "Cannot use a static data area with multiple contexts!"
-	#endif
-#elif (MEM_METHOD==MEM_MALLOC)
-	for (i=0 ; i<MULTITHREAD; i++) {
-		ee_s32 malloc_override=get_seed(7);
-		if (malloc_override != 0) 
-			results[i].size=malloc_override;
-			ee_printf("%d \n malloc datasize", malloc_override);
-		else
-			results[i].size=TOTAL_DATA_SIZE;
-		results[i].memblock[0]=portable_malloc(results[i].size);
-		results[i].seed1=results[0].seed1;
-		results[i].seed2=results[0].seed2;
-		results[i].seed3=results[0].seed3;
-		results[i].err=0;
-		results[i].execs=results[0].execs;
-	}
-#elif (MEM_METHOD==MEM_STACK)
-	for (i=0 ; i<MULTITHREAD; i++) {
-		results[i].memblock[0]=stack_memblock+i*TOTAL_DATA_SIZE;
-		results[i].size=TOTAL_DATA_SIZE;
-		results[i].seed1=results[0].seed1;
-		results[i].seed2=results[0].seed2;
-		results[i].seed3=results[0].seed3;
-		results[i].err=0;
-		results[i].execs=results[0].execs;
-	}
+    results[0].execs = get_seed_32(5);
+    if (results[0].execs == 0)
+    { /* if not supplied, execute all algorithms */
+        results[0].execs = ALL_ALGORITHMS_MASK;
+    }
+    /* put in some default values based on one seed only for easy testing */
+    if ((results[0].seed1 == 0) && (results[0].seed2 == 0)
+        && (results[0].seed3 == 0))
+    { /* performance run */
+        results[0].seed1 = 0;
+        results[0].seed2 = 0;
+        results[0].seed3 = 0x66;
+    }
+    if ((results[0].seed1 == 1) && (results[0].seed2 == 0)
+        && (results[0].seed3 == 0))
+    { /* validation run */
+        results[0].seed1 = 0x3415;
+        results[0].seed2 = 0x3415;
+        results[0].seed3 = 0x66;
+    }
+#if (MEM_METHOD == MEM_STATIC)
+    results[0].memblock[0] = (void *)static_memblk;
+    results[0].size        = TOTAL_DATA_SIZE;
+    results[0].err         = 0;
+#if (MULTITHREAD > 1)
+#error "Cannot use a static data area with multiple contexts!"
+#endif
+#elif (MEM_METHOD == MEM_MALLOC)
+    for (i = 0; i < MULTITHREAD; i++)
+    {
+        ee_s32 malloc_override = get_seed(7);
+        if (malloc_override != 0)
+            results[i].size = malloc_override;
+        else
+            results[i].size = TOTAL_DATA_SIZE;
+        results[i].memblock[0] = portable_malloc(results[i].size);
+        results[i].seed1       = results[0].seed1;
+        results[i].seed2       = results[0].seed2;
+        results[i].seed3       = results[0].seed3;
+        results[i].err         = 0;
+        results[i].execs       = results[0].execs;
+    }
+#elif (MEM_METHOD == MEM_STACK)
+for (i = 0; i < MULTITHREAD; i++)
+{
+    results[i].memblock[0] = stack_memblock + i * TOTAL_DATA_SIZE;
+    results[i].size        = TOTAL_DATA_SIZE;
+    results[i].seed1       = results[0].seed1;
+    results[i].seed2       = results[0].seed2;
+    results[i].seed3       = results[0].seed3;
+    results[i].err         = 0;
+    results[i].execs       = results[0].execs;
+}
 #else
 #error "Please define a way to initialize a memory block."
 #endif
-	/* Data init */ 
-	/* Find out how space much we have based on number of algorithms */
-	for (i=0; i<NUM_ALGORITHMS; i++) {
-		if ((1<<(ee_u32)i) & results[0].execs)
-			num_algorithms++;
-	}
-	for (i=0 ; i<MULTITHREAD; i++) 
-		results[i].size=results[i].size/num_algorithms;
-	/* Assign pointers */
-	for (i=0; i<NUM_ALGORITHMS; i++) {
-		ee_u32 ctx;
-		if ((1<<(ee_u32)i) & results[0].execs) {
-			for (ctx=0 ; ctx<MULTITHREAD; ctx++)
-				results[ctx].memblock[i+1]=(char *)(results[ctx].memblock[0])+results[0].size*j;
-			j++;
-		}
-	}
-	/* call inits */
-	for (i=0 ; i<MULTITHREAD; i++) {
-		if (results[i].execs & ID_LIST) {
-			ee_printf("loop");
-			ee_printf("%d \n", MULTITHREAD);
-			ee_printf("%d \n sizethread ", results[0].size);
+    /* Data init */
+    /* Find out how much space we have based on number of algorithms */
+    for (i = 0; i < NUM_ALGORITHMS; i++)
+    {
+        if ((1 << (ee_u32)i) & results[0].execs)
+            num_algorithms++;
+    }
+    for (i = 0; i < MULTITHREAD; i++)
+        results[i].size = results[i].size / num_algorithms;
+    /* Assign pointers */
+    for (i = 0; i < NUM_ALGORITHMS; i++)
+    {
+        ee_u32 ctx;
+        if ((1 << (ee_u32)i) & results[0].execs)
+        {
+            for (ctx = 0; ctx < MULTITHREAD; ctx++)
+                results[ctx].memblock[i + 1]
+                    = (char *)(results[ctx].memblock[0]) + results[0].size * j;
+            j++;
+        }
+    }
+    /* call inits */
+    for (i = 0; i < MULTITHREAD; i++)
+    {
+        if (results[i].execs & ID_LIST)
+        {
+            results[i].list = core_list_init(
+                results[0].size, results[i].memblock[1], results[i].seed1);
+        }
+        if (results[i].execs & ID_MATRIX)
+        {
+            core_init_matrix(results[0].size,
+                             results[i].memblock[2],
+                             (ee_s32)results[i].seed1
+                                 | (((ee_s32)results[i].seed2) << 16),
+                             &(results[i].mat));
+        }
+        if (results[i].execs & ID_STATE)
+        {
+            core_init_state(
+                results[0].size, results[i].seed1, results[i].memblock[3]);
+        }
+    }

-
-			results[i].list=core_list_init(results[0].size,results[i].memblock[1],results[i].seed1);
-			
-		}
-		if (results[i].execs & ID_MATRIX) {
-			core_init_matrix(results[0].size, results[i].memblock[2], (ee_s32)results[i].seed1 | (((ee_s32)results[i].seed2) << 16), &(results[i].mat) );
-		}
-		if (results[i].execs & ID_STATE) {
-			core_init_state(results[0].size,results[i].seed1,results[i].memblock[3]);
-		}
-	}
-
- /*int foreverLoop = 1;
- secs_ret timing = 0;
- int timingInt;
- ee_printf("\nENTERING FOREVER WHILE LOOP\n");
- while(foreverLoop == 1)
- {
-	 start_time();
-	 //filler
-	 stop_time();
-	 timing += time_in_secs(get_time());
-	 timingInt = (int)timing;
-	 ee_printf("Timing is %d\n", timingInt);
- }/*
-
-	/* automatically determine number of iterations if not set */
-	if (results[0].iterations==0) { 
-		secs_ret secs_passed=0;
-		ee_u32 divisor;
-		results[0].iterations=1;
-		int iterationInc = 0;
-		ee_printf("\n\nENTERING ITERATION WHILE LOOP\n");
-		while (secs_passed < (secs_ret)1) {
-			if(iterationInc != 0)
-			{
-			  results[0].iterations++;
-			}
-			ee_printf("iterations is %d\n", results[0].iterations);
-			start_time();
-			iterate(&results[0]);
-			stop_time();
-			secs_passed = time_in_secs(get_time());
-			int secs_passed_int = (int)secs_passed;
-			ee_printf("secs passed is %d\n", secs_passed_int);
-			iterationInc++;
-		}
-		ee_printf("LEAVING ITERATION WHILE LOOP!\n\n");
-		/* now we know it executes for at least 1 sec, set actual run time at about 10 secs */
-		divisor=(ee_u32)secs_passed;
-		ee_printf("divisor is %lu\n", divisor);
-		if (divisor==0) /* some machines cast float to int as 0 since this conversion is not defined by ANSI, but we know at least one second passed */
-			divisor=1;
-		results[0].iterations*=1+10/divisor;
-		ee_printf("iterations is %d\n", results[0].iterations);
-	}
-	/* perform actual benchmark */
-	ee_printf("Starting benchmark\n");
-	start_time();
-#if (MULTITHREAD>1)
-	if (default_num_contexts>MULTITHREAD) {
-		default_num_contexts=MULTITHREAD;
-	}
-	for (i=0 ; i<default_num_contexts; i++) {
-		results[i].iterations=results[0].iterations;
-		results[i].execs=results[0].execs;
-		core_start_parallel(&results[i]);
-	}
-	for (i=0 ; i<default_num_contexts; i++) {
-		core_stop_parallel(&results[i]);
-	}
+    /* automatically determine number of iterations if not set */
+    if (results[0].iterations == 0)
+    {
+        secs_ret secs_passed = 0;
+        ee_u32   divisor;
+        results[0].iterations = 1;
+        while (secs_passed < (secs_ret)1)
+        {
+            results[0].iterations *= 10;
+            start_time();
+            iterate(&results[0]);
+            stop_time();
+            secs_passed = time_in_secs(get_time());
+        }
+        /* now we know it executes for at least 1 sec, set actual run time at
+         * about 10 secs */
+        divisor = (ee_u32)secs_passed;
+        if (divisor == 0) /* some machines cast float to int as 0 since this
+                             conversion is not defined by ANSI, but we know at
+                             least one second passed */
+            divisor = 1;
+        results[0].iterations *= 1 + 10 / divisor;
+    }
+    /* perform actual benchmark */
+    start_time();
+#if (MULTITHREAD > 1)
+    if (default_num_contexts > MULTITHREAD)
+    {
+        default_num_contexts = MULTITHREAD;
+    }
+    for (i = 0; i < default_num_contexts; i++)
+    {
+        results[i].iterations = results[0].iterations;
+        results[i].execs      = results[0].execs;
+        core_start_parallel(&results[i]);
+    }
+    for (i = 0; i < default_num_contexts; i++)
+    {
+        core_stop_parallel(&results[i]);
+    }
 #else
-	iterate(&results[0]);
+    iterate(&results[0]);
 #endif
-	stop_time();
-	total_time=get_time();
-	ee_printf("total time is %u\n", total_time);
-	ee_printf("ending benchmark\n");
-	/* get a function of the input to report */
-	seedcrc=crc16(results[0].seed1,seedcrc);
-	seedcrc=crc16(results[0].seed2,seedcrc);
-	seedcrc=crc16(results[0].seed3,seedcrc);
-	seedcrc=crc16(results[0].size,seedcrc);
-	
-	switch (seedcrc) { /* test known output for common seeds */
-		case 0x8a02: /* seed1=0, seed2=0, seed3=0x66, size 2000 per algorithm */
-			known_id=0;
-			ee_printf("6k performance run parameters for coremark.\n");
-			break;
-		case 0x7b05: /*  seed1=0x3415, seed2=0x3415, seed3=0x66, size 2000 per algorithm */
-			known_id=1;
-			ee_printf("6k validation run parameters for coremark.\n");
-			break;
-		case 0x4eaf: /* seed1=0x8, seed2=0x8, seed3=0x8, size 400 per algorithm */
-			known_id=2;
-			ee_printf("Profile generation run parameters for coremark.\n");
-			break;
-		case 0xe9f5: /* seed1=0, seed2=0, seed3=0x66, size 666 per algorithm */
-			known_id=3;
-			ee_printf("2K performance run parameters for coremark.\n");
-			break;
-		case 0x18f2: /*  seed1=0x3415, seed2=0x3415, seed3=0x66, size 666 per algorithm */
-			known_id=4;
-			ee_printf("2K validation run parameters for coremark.\n");
-			break;
-		default:
-			total_errors=-1;
-			break;
-	}
-	if (known_id>=0) {
-		for (i=0 ; i<default_num_contexts; i++) {
-			results[i].err=0;
-			if ((results[i].execs & ID_LIST) && 
-				(results[i].crclist!=list_known_crc[known_id])) {
-				ee_printf("[%u]ERROR! list crc 0x%04x - should be 0x%04x\n",i,results[i].crclist,list_known_crc[known_id]);
-				results[i].err++;
-			}
-			if ((results[i].execs & ID_MATRIX) &&
-				(results[i].crcmatrix!=matrix_known_crc[known_id])) {
-				ee_printf("[%u]ERROR! matrix crc 0x%04x - should be 0x%04x\n",i,results[i].crcmatrix,matrix_known_crc[known_id]);
-				results[i].err++;
-			}
-			if ((results[i].execs & ID_STATE) &&
-				(results[i].crcstate!=state_known_crc[known_id])) {
-				ee_printf("[%u]ERROR! state crc 0x%04x - should be 0x%04x\n",i,results[i].crcstate,state_known_crc[known_id]);
-				results[i].err++;
-			}
-			total_errors+=results[i].err;
-		}
-	}
-	total_errors+=check_data_types();
-	/* and report results */
-	//ee_printf("CoreMark Size    : %lu\n", (long unsigned) results[0].size);
-	ee_printf("CoreMark Size    : %lu\n", (long unsigned) results[0].size);
-	ee_printf("Total ticks      : %lu\n", (long unsigned) total_time);
-#if HAS_FLOAT
-	ee_printf("Total time (secs): %f\n",time_in_secs(total_time));
-	if (time_in_secs(total_time) > 0)
-		ee_printf("Iterations/Sec   : %f\n",default_num_contexts*results[0].iterations/time_in_secs(total_time));
-#else 
-	ee_printf("Total time (secs): %d\n,time_in_secs(total_time)");
-	if (time_in_secs(total_time) > 0)
-		ee_printf("Iterations/Sec   : %d\n",default_num_contexts*results[0].iterations/time_in_secs(total_time));
-#endif
-	if (time_in_secs(total_time) < 10) {
-		ee_printf("ERROR! Must execute for at least 10 secs for a valid result!\n");
-		total_errors++;
-	}
+    stop_time();
+    total_time = get_time();
+    /* get a function of the input to report */
+    seedcrc = crc16(results[0].seed1, seedcrc);
+    seedcrc = crc16(results[0].seed2, seedcrc);
+    seedcrc = crc16(results[0].seed3, seedcrc);
+    seedcrc = crc16(results[0].size, seedcrc);

-	ee_printf("Iterations       : %lu\n", (long unsigned) default_num_contexts*results[0].iterations);
-	ee_printf("Compiler version : %s\n",COMPILER_VERSION);
-	ee_printf("Compiler flags   : %s\n",COMPILER_FLAGS);
-#if (MULTITHREAD>1)
-	ee_printf("Parallel %s : %d\n",PARALLEL_METHOD,default_num_contexts);
-#endif
-	ee_printf("Memory location  : %s\n",MEM_LOCATION);
-	/* output for verification */
-	ee_printf("seedcrc          : 0x%04x\n",seedcrc);
-	if (results[0].execs & ID_LIST)
-		for (i=0 ; i<default_num_contexts; i++) 
-			ee_printf("[%d]crclist       : 0x%04x\n",i,results[i].crclist);
-	if (results[0].execs & ID_MATRIX) 
-		for (i=0 ; i<default_num_contexts; i++) 
-			ee_printf("[%d]crcmatrix     : 0x%04x\n",i,results[i].crcmatrix);
-	if (results[0].execs & ID_STATE)
-		for (i=0 ; i<default_num_contexts; i++) 
-			ee_printf("[%d]crcstate      : 0x%04x\n",i,results[i].crcstate);
-	for (i=0 ; i<default_num_contexts; i++) 
-		ee_printf("[%d]crcfinal      : 0x%04x\n",i,results[i].crc);
-	if (total_errors==0) {
-		ee_printf("Correct operation validated. See README.md for run and reporting rules.\n");
+    switch (seedcrc)
+    {                /* test known output for common seeds */
+        case 0x8a02: /* seed1=0, seed2=0, seed3=0x66, size 2000 per algorithm */
+            known_id = 0;
+            ee_printf("6k performance run parameters for coremark.\n");
+            break;
+        case 0x7b05: /*  seed1=0x3415, seed2=0x3415, seed3=0x66, size 2000 per
+                        algorithm */
+            known_id = 1;
+            ee_printf("6k validation run parameters for coremark.\n");
+            break;
+        case 0x4eaf: /* seed1=0x8, seed2=0x8, seed3=0x8, size 400 per algorithm
+                      */
+            known_id = 2;
+            ee_printf("Profile generation run parameters for coremark.\n");
+            break;
+        case 0xe9f5: /* seed1=0, seed2=0, seed3=0x66, size 666 per algorithm */
+            known_id = 3;
+            ee_printf("2K performance run parameters for coremark.\n");
+            break;
+        case 0x18f2: /*  seed1=0x3415, seed2=0x3415, seed3=0x66, size 666 per
+                        algorithm */
+            known_id = 4;
+            ee_printf("2K validation run parameters for coremark.\n");
+            break;
+        default:
+            total_errors = -1;
+            break;
+    }
+    if (known_id >= 0)
+    {
+        for (i = 0; i < default_num_contexts; i++)
+        {
+            results[i].err = 0;
+            if ((results[i].execs & ID_LIST)
+                && (results[i].crclist != list_known_crc[known_id]))
+            {
+                ee_printf("[%u]ERROR! list crc 0x%04x - should be 0x%04x\n",
+                          i,
+                          results[i].crclist,
+                          list_known_crc[known_id]);
+                results[i].err++;
+            }
+            if ((results[i].execs & ID_MATRIX)
+                && (results[i].crcmatrix != matrix_known_crc[known_id]))
+            {
+                ee_printf("[%u]ERROR! matrix crc 0x%04x - should be 0x%04x\n",
+                          i,
+                          results[i].crcmatrix,
+                          matrix_known_crc[known_id]);
+                results[i].err++;
+            }
+            if ((results[i].execs & ID_STATE)
+                && (results[i].crcstate != state_known_crc[known_id]))
+            {
+                ee_printf("[%u]ERROR! state crc 0x%04x - should be 0x%04x\n",
+                          i,
+                          results[i].crcstate,
+                          state_known_crc[known_id]);
+                results[i].err++;
+            }
+            total_errors += results[i].err;
+        }
+    }
+    total_errors += check_data_types();
+    /* and report results */
+    ee_printf("CoreMark Size    : %lu\n", (long unsigned)results[0].size);
+    ee_printf("Total ticks      : %lu\n", (long unsigned)total_time);
 #if HAS_FLOAT
-		if (known_id==3) {
-			unsigned long long tmp = (unsigned long long) 1000.0*default_num_contexts*results[0].iterations/time_in_secs(total_time);
+    ee_printf("Total time (secs): %f\n", time_in_secs(total_time));
+    if (time_in_secs(total_time) > 0)
+        ee_printf("Iterations/Sec   : %f\n",
+                  default_num_contexts * results[0].iterations
+                      / time_in_secs(total_time));
+#else
+    ee_printf("Total time (secs): %d\n", time_in_secs(total_time));
+    if (time_in_secs(total_time) > 0)
+        ee_printf("Iterations/Sec   : %d\n",
+                  default_num_contexts * results[0].iterations
+                      / time_in_secs(total_time));
+#endif
+    if (time_in_secs(total_time) < 10)
+    {
+        ee_printf(
+            "ERROR! Must execute for at least 10 secs for a valid result!\n");
+        total_errors++;
+    }
+
+    ee_printf("Iterations       : %lu\n",
+              (long unsigned)default_num_contexts * results[0].iterations);
+    ee_printf("Compiler version : %s\n", COMPILER_VERSION);
+    ee_printf("Compiler flags   : %s\n", COMPILER_FLAGS);
+#if (MULTITHREAD > 1)
+    ee_printf("Parallel %s : %d\n", PARALLEL_METHOD, default_num_contexts);
+#endif
+    ee_printf("Memory location  : %s\n", MEM_LOCATION);
+    /* output for verification */
+    ee_printf("seedcrc          : 0x%04x\n", seedcrc);
+    if (results[0].execs & ID_LIST)
+        for (i = 0; i < default_num_contexts; i++)
+            ee_printf("[%d]crclist       : 0x%04x\n", i, results[i].crclist);
+    if (results[0].execs & ID_MATRIX)
+        for (i = 0; i < default_num_contexts; i++)
+            ee_printf("[%d]crcmatrix     : 0x%04x\n", i, results[i].crcmatrix);
+    if (results[0].execs & ID_STATE)
+        for (i = 0; i < default_num_contexts; i++)
+            ee_printf("[%d]crcstate      : 0x%04x\n", i, results[i].crcstate);
+    for (i = 0; i < default_num_contexts; i++)
+        ee_printf("[%d]crcfinal      : 0x%04x\n", i, results[i].crc);
+    if (total_errors == 0)
+    {
+        ee_printf(
+            "Correct operation validated. See README.md for run and reporting "
+            "rules.\n");
+#if HAS_FLOAT
+        if (known_id == 3)
+        {
+            unsigned long long tmp = (unsigned long long) 1000.0*default_num_contexts*results[0].iterations/time_in_secs(total_time);
 			secs_ret totalmsecs = time_in_secs(total_time);
 			int totalmint = (int) totalmsecs;
-			ee_printf("ELAPSED S: %d\n", totalmint);
+			ee_printf("ELAPSED TIME: %d\n", totalmint);

-			ee_printf("CoreMark 1.0 : %d / %s %s\n",tmp,COMPILER_VERSION,COMPILER_FLAGS);
+            ee_printf("CoreMark 1.0 : %d / %s %s",
+                      tmp,
+                      COMPILER_VERSION,
+                      COMPILER_FLAGS);
 #if defined(MEM_LOCATION) && !defined(MEM_LOCATION_UNSPEC)
-			ee_printf(" / %s",MEM_LOCATION);
+            ee_printf(" / %s", MEM_LOCATION);
 #else
-			ee_printf(" / %s",mem_name[MEM_METHOD]);
+            ee_printf(" / %s", mem_name[MEM_METHOD]);
 #endif

-#if (MULTITHREAD>1)
-			ee_printf(" / %d:%s",default_num_contexts,PARALLEL_METHOD);
+#if (MULTITHREAD > 1)
+            ee_printf(" / %d:%s", default_num_contexts, PARALLEL_METHOD);
 #endif
-			ee_printf("\n");
-		}
+            ee_printf("\n");
+        }
 #endif
-	}
-	if (total_errors>0)
-		ee_printf("Errors detected\n");
-	if (total_errors<0)
-		ee_printf("Cannot validate operation for these seed values, please compare with results on a known platform.\n");
+    }
+    if (total_errors > 0)
+        ee_printf("Errors detected\n");
+    if (total_errors < 0)
+        ee_printf(
+            "Cannot validate operation for these seed values, please compare "
+            "with results on a known platform.\n");

-#if (MEM_METHOD==MEM_MALLOC)
-	for (i=0 ; i<MULTITHREAD; i++) 
-		portable_free(results[i].memblock[0]);
+#if (MEM_METHOD == MEM_MALLOC)
+    for (i = 0; i < MULTITHREAD; i++)
+        portable_free(results[i].memblock[0]);
 #endif
-	/* And last call any target specific code for finalizing */
-	portable_fini(&(results[0].port));
+    /* And last call any target specific code for finalizing */
+    portable_fini(&(results[0].port));

-	return MAIN_RETURN_VAL;	
+    return MAIN_RETURN_VAL;
 }
-
-	//pls
-
-
--- a/riscv-coremark/coremark/core_matrix.c
+++ b/riscv-coremark/coremark/core_matrix.c
@ -19,290 +19,341 @@ Original Author: Shay Gal-on
 #include "coremark.h"
 /*
 Topic: Description
-	Matrix manipulation benchmark
-	
-	This very simple algorithm forms the basis of many more complex algorithms. 
-	
-	The tight inner loop is the focus of many optimizations (compiler as well as hardware based) 
-	and is thus relevant for embedded processing. 
-	
-	The total available data space will be divided to 3 parts:
-	NxN Matrix A - initialized with small values (upper 3/4 of the bits all zero).
-	NxN Matrix B - initialized with medium values (upper half of the bits all zero).
-	NxN Matrix C - used for the result.
+        Matrix manipulation benchmark

-	The actual values for A and B must be derived based on input that is not available at compile time.
+        This very simple algorithm forms the basis of many more complex
+algorithms.
+
+        The tight inner loop is the focus of many optimizations (compiler as
+well as hardware based) and is thus relevant for embedded processing.
+
+        The total available data space will be divided to 3 parts:
+        NxN Matrix A - initialized with small values (upper 3/4 of the bits all zero).
+        NxN Matrix B - initialized with medium values (upper half of the bits all zero).
+        NxN Matrix C - used for the result.
+
+        The actual values for A and B must be derived based on input that is not
+available at compile time.
 */
 ee_s16 matrix_test(ee_u32 N, MATRES *C, MATDAT *A, MATDAT *B, MATDAT val);
 ee_s16 matrix_sum(ee_u32 N, MATRES *C, MATDAT clipval);
-void matrix_mul_const(ee_u32 N, MATRES *C, MATDAT *A, MATDAT val);
-void matrix_mul_vect(ee_u32 N, MATRES *C, MATDAT *A, MATDAT *B);
-void matrix_mul_matrix(ee_u32 N, MATRES *C, MATDAT *A, MATDAT *B);
-void matrix_mul_matrix_bitextract(ee_u32 N, MATRES *C, MATDAT *A, MATDAT *B);
-void matrix_add_const(ee_u32 N, MATDAT *A, MATDAT val);
+void   matrix_mul_const(ee_u32 N, MATRES *C, MATDAT *A, MATDAT val);
+void   matrix_mul_vect(ee_u32 N, MATRES *C, MATDAT *A, MATDAT *B);
+void   matrix_mul_matrix(ee_u32 N, MATRES *C, MATDAT *A, MATDAT *B);
+void   matrix_mul_matrix_bitextract(ee_u32 N, MATRES *C, MATDAT *A, MATDAT *B);
+void   matrix_add_const(ee_u32 N, MATDAT *A, MATDAT val);

-#define matrix_test_next(x) (x+1)
-#define matrix_clip(x,y) ((y) ? (x) & 0x0ff : (x) & 0x0ffff)
-#define matrix_big(x) (0xf000 | (x))
-#define bit_extract(x,from,to) (((x)>>(from)) & (~(0xffffffff << (to))))
+#define matrix_test_next(x)      (x + 1)
+#define matrix_clip(x, y)        ((y) ? (x)&0x0ff : (x)&0x0ffff)
+#define matrix_big(x)            (0xf000 | (x))
+#define bit_extract(x, from, to) (((x) >> (from)) & (~(0xffffffff << (to))))

 #if CORE_DEBUG
-void printmat(MATDAT *A, ee_u32 N, char *name) {
-	ee_u32 i,j;
-	ee_printf("Matrix %s [%dx%d]:\n",name,N,N);
-	for (i=0; i<N; i++) {
-		for (j=0; j<N; j++) {
-			if (j!=0)
-				ee_printf(",");
-			ee_printf("%d",A[i*N+j]);
-		}
-		ee_printf("\n");
-	}
+void
+printmat(MATDAT *A, ee_u32 N, char *name)
+{
+    ee_u32 i, j;
+    ee_printf("Matrix %s [%dx%d]:\n", name, N, N);
+    for (i = 0; i < N; i++)
+    {
+        for (j = 0; j < N; j++)
+        {
+            if (j != 0)
+                ee_printf(",");
+            ee_printf("%d", A[i * N + j]);
+        }
+        ee_printf("\n");
+    }
 }
-void printmatC(MATRES *C, ee_u32 N, char *name) {
-	ee_u32 i,j;
-	ee_printf("Matrix %s [%dx%d]:\n",name,N,N);
-	for (i=0; i<N; i++) {
-		for (j=0; j<N; j++) {
-			if (j!=0)
-				ee_printf(",");
-			ee_printf("%d",C[i*N+j]);
-		}
-		ee_printf("\n");
-	}
+void
+printmatC(MATRES *C, ee_u32 N, char *name)
+{
+    ee_u32 i, j;
+    ee_printf("Matrix %s [%dx%d]:\n", name, N, N);
+    for (i = 0; i < N; i++)
+    {
+        for (j = 0; j < N; j++)
+        {
+            if (j != 0)
+                ee_printf(",");
+            ee_printf("%d", C[i * N + j]);
+        }
+        ee_printf("\n");
+    }
 }
 #endif
 /* Function: core_bench_matrix
-	Benchmark function
+        Benchmark function

-	Iterate <matrix_test> N times, 
-	changing the matrix values slightly by a constant amount each time.
+        Iterate <matrix_test> N times,
+        changing the matrix values slightly by a constant amount each time.
 */
-ee_u16 core_bench_matrix(mat_params *p, ee_s16 seed, ee_u16 crc) {
-	ee_u32 N=p->N;
-	MATRES *C=p->C;
-	MATDAT *A=p->A;
-	MATDAT *B=p->B;
-	MATDAT val=(MATDAT)seed;
+ee_u16
+core_bench_matrix(mat_params *p, ee_s16 seed, ee_u16 crc)
+{
+    ee_u32  N   = p->N;
+    MATRES *C   = p->C;
+    MATDAT *A   = p->A;
+    MATDAT *B   = p->B;
+    MATDAT  val = (MATDAT)seed;

-	crc=crc16(matrix_test(N,C,A,B,val),crc);
+    crc = crc16(matrix_test(N, C, A, B, val), crc);

-	return crc;
+    return crc;
 }

 /* Function: matrix_test
-	Perform matrix manipulation.
+        Perform matrix manipulation.

-	Parameters:
-	N - Dimensions of the matrix.
-	C - memory for result matrix.
-	A - input matrix
-	B - operator matrix (not changed during operations)
+        Parameters:
+        N - Dimensions of the matrix.
+        C - memory for result matrix.
+        A - input matrix
+        B - operator matrix (not changed during operations)

-	Returns:
-	A CRC value that captures all results calculated in the function.
-	In particular, crc of the value calculated on the result matrix 
-	after each step by <matrix_sum>.
+        Returns:
+        A CRC value that captures all results calculated in the function.
+        In particular, crc of the value calculated on the result matrix
+        after each step by <matrix_sum>.

-	Operation:
-	
-	1 - Add a constant value to all elements of a matrix.
-	2 - Multiply a matrix by a constant.
-	3 - Multiply a matrix by a vector.
-	4 - Multiply a matrix by a matrix.
-	5 - Add a constant value to all elements of a matrix.
+        Operation:

-	After the last step, matrix A is back to original contents.
+        1 - Add a constant value to all elements of a matrix.
+        2 - Multiply a matrix by a constant.
+        3 - Multiply a matrix by a vector.
+        4 - Multiply a matrix by a matrix.
+        5 - Add a constant value to all elements of a matrix.
+
+        After the last step, matrix A is back to original contents.
 */
-ee_s16 matrix_test(ee_u32 N, MATRES *C, MATDAT *A, MATDAT *B, MATDAT val) {
-	ee_u16 crc=0;
-	MATDAT clipval=matrix_big(val);
+ee_s16
+matrix_test(ee_u32 N, MATRES *C, MATDAT *A, MATDAT *B, MATDAT val)
+{
+    ee_u16 crc     = 0;
+    MATDAT clipval = matrix_big(val);

-	matrix_add_const(N,A,val); /* make sure data changes  */
+    matrix_add_const(N, A, val); /* make sure data changes  */
 #if CORE_DEBUG
-	printmat(A,N,"matrix_add_const");
+    printmat(A, N, "matrix_add_const");
 #endif
-	matrix_mul_const(N,C,A,val);
-	crc=crc16(matrix_sum(N,C,clipval),crc);
+    matrix_mul_const(N, C, A, val);
+    crc = crc16(matrix_sum(N, C, clipval), crc);
 #if CORE_DEBUG
-	printmatC(C,N,"matrix_mul_const");
+    printmatC(C, N, "matrix_mul_const");
 #endif
-	matrix_mul_vect(N,C,A,B);
-	crc=crc16(matrix_sum(N,C,clipval),crc);
+    matrix_mul_vect(N, C, A, B);
+    crc = crc16(matrix_sum(N, C, clipval), crc);
 #if CORE_DEBUG
-	printmatC(C,N,"matrix_mul_vect");
+    printmatC(C, N, "matrix_mul_vect");
 #endif
-	matrix_mul_matrix(N,C,A,B);
-	crc=crc16(matrix_sum(N,C,clipval),crc);
+    matrix_mul_matrix(N, C, A, B);
+    crc = crc16(matrix_sum(N, C, clipval), crc);
 #if CORE_DEBUG
-	printmatC(C,N,"matrix_mul_matrix");
+    printmatC(C, N, "matrix_mul_matrix");
 #endif
-	matrix_mul_matrix_bitextract(N,C,A,B);
-	crc=crc16(matrix_sum(N,C,clipval),crc);
+    matrix_mul_matrix_bitextract(N, C, A, B);
+    crc = crc16(matrix_sum(N, C, clipval), crc);
 #if CORE_DEBUG
-	printmatC(C,N,"matrix_mul_matrix_bitextract");
+    printmatC(C, N, "matrix_mul_matrix_bitextract");
 #endif
-	
-	matrix_add_const(N,A,-val); /* return matrix to initial value */
-	return crc;
+
+    matrix_add_const(N, A, -val); /* return matrix to initial value */
+    return crc;
 }

 /* Function : matrix_init
-	Initialize the memory block for matrix benchmarking.
+        Initialize the memory block for matrix benchmarking.

-	Parameters:
-	blksize - Size of memory to be initialized.
-	memblk - Pointer to memory block.
-	seed - Actual values chosen depend on the seed parameter.
-	p - pointers to <mat_params> containing initialized matrixes.
+        Parameters:
+        blksize - Size of memory to be initialized.
+        memblk - Pointer to memory block.
+        seed - Actual values chosen depend on the seed parameter.
+        p - pointers to <mat_params> containing initialized matrixes.

-	Returns:
-	Matrix dimensions.
-	
-	Note:
-	The seed parameter MUST be supplied from a source that cannot be determined at compile time
+        Returns:
+        Matrix dimensions.
+
+        Note:
+        The seed parameter MUST be supplied from a source that cannot be
+   determined at compile time
 */
-ee_u32 core_init_matrix(ee_u32 blksize, void *memblk, ee_s32 seed, mat_params *p) {
-	ee_u32 N=0;
-	MATDAT *A;
-	MATDAT *B;
-	ee_s32 order=1;
-	MATDAT val;
-	ee_u32 i=0,j=0;
-	if (seed==0)
-		seed=1;
-	while (j<blksize) {
-		i++;
-		j=i*i*2*4;		
-	}
-	N=i-1;
-	A=(MATDAT *)align_mem(memblk);
-	B=A+N*N;
+ee_u32
+core_init_matrix(ee_u32 blksize, void *memblk, ee_s32 seed, mat_params *p)
+{
+    ee_u32  N = 0;
+    MATDAT *A;
+    MATDAT *B;
+    ee_s32  order = 1;
+    MATDAT  val;
+    ee_u32  i = 0, j = 0;
+    if (seed == 0)
+        seed = 1;
+    while (j < blksize)
+    {
+        i++;
+        j = i * i * 2 * 4;
+    }
+    N = i - 1;
+    A = (MATDAT *)align_mem(memblk);
+    B = A + N * N;

-	for (i=0; i<N; i++) {
-		for (j=0; j<N; j++) {
-			seed = ( ( order * seed ) % 65536 );
-			val = (seed + order);
-			val=matrix_clip(val,0);
-			B[i*N+j] = val;
-			val =  (val + order);
-			val=matrix_clip(val,1);
-			A[i*N+j] = val;
-			order++;
-		}
-	}
+    for (i = 0; i < N; i++)
+    {
+        for (j = 0; j < N; j++)
+        {
+            seed         = ((order * seed) % 65536);
+            val          = (seed + order);
+            val          = matrix_clip(val, 0);
+            B[i * N + j] = val;
+            val          = (val + order);
+            val          = matrix_clip(val, 1);
+            A[i * N + j] = val;
+            order++;
+        }
+    }

-	p->A=A;
-	p->B=B;
-	p->C=(MATRES *)align_mem(B+N*N);
-	p->N=N;
+    p->A = A;
+    p->B = B;
+    p->C = (MATRES *)align_mem(B + N * N);
+    p->N = N;
 #if CORE_DEBUG
-	printmat(A,N,"A");
-	printmat(B,N,"B");
+    printmat(A, N, "A");
+    printmat(B, N, "B");
 #endif
-	return N;
+    return N;
 }

 /* Function: matrix_sum
-	Calculate a function that depends on the values of elements in the matrix.
+        Calculate a function that depends on the values of elements in the
+   matrix.

-	For each element, accumulate into a temporary variable.
-	
-	As long as this value is under the parameter clipval, 
-	add 1 to the result if the element is bigger then the previous.
-	
-	Otherwise, reset the accumulator and add 10 to the result.
+        For each element, accumulate into a temporary variable.
+
+        As long as this value is under the parameter clipval,
+        add 1 to the result if the element is bigger than the previous.
+
+        Otherwise, reset the accumulator and add 10 to the result.
 */
-ee_s16 matrix_sum(ee_u32 N, MATRES *C, MATDAT clipval) {
-	MATRES tmp=0,prev=0,cur=0;
-	ee_s16 ret=0;
-	ee_u32 i,j;
-	for (i=0; i<N; i++) {
-		for (j=0; j<N; j++) {
-			cur=C[i*N+j];
-			tmp+=cur;
-			if (tmp>clipval) {
-				ret+=10;
-				tmp=0;
-			} else {
-				ret += (cur>prev) ? 1 : 0;
-			}
-			prev=cur;
-		}
-	}
-	return ret;
+ee_s16
+matrix_sum(ee_u32 N, MATRES *C, MATDAT clipval)
+{
+    MATRES tmp = 0, prev = 0, cur = 0;
+    ee_s16 ret = 0;
+    ee_u32 i, j;
+    for (i = 0; i < N; i++)
+    {
+        for (j = 0; j < N; j++)
+        {
+            cur = C[i * N + j];
+            tmp += cur;
+            if (tmp > clipval)
+            {
+                ret += 10;
+                tmp = 0;
+            }
+            else
+            {
+                ret += (cur > prev) ? 1 : 0;
+            }
+            prev = cur;
+        }
+    }
+    return ret;
 }

 /* Function: matrix_mul_const
-	Multiply a matrix by a constant.
-	This could be used as a scaler for instance.
+        Multiply a matrix by a constant.
+        This could be used as a scaler for instance.
 */
-void matrix_mul_const(ee_u32 N, MATRES *C, MATDAT *A, MATDAT val) {
-	ee_u32 i,j;
-	for (i=0; i<N; i++) {
-		for (j=0; j<N; j++) {
-			C[i*N+j]=(MATRES)A[i*N+j] * (MATRES)val;
-		}
-	}
+void
+matrix_mul_const(ee_u32 N, MATRES *C, MATDAT *A, MATDAT val)
+{
+    ee_u32 i, j;
+    for (i = 0; i < N; i++)
+    {
+        for (j = 0; j < N; j++)
+        {
+            C[i * N + j] = (MATRES)A[i * N + j] * (MATRES)val;
+        }
+    }
 }

 /* Function: matrix_add_const
-	Add a constant value to all elements of a matrix.
+        Add a constant value to all elements of a matrix.
 */
-void matrix_add_const(ee_u32 N, MATDAT *A, MATDAT val) {
-	ee_u32 i,j;
-	for (i=0; i<N; i++) {
-		for (j=0; j<N; j++) {
-			A[i*N+j] += val;
-		}
-	}
+void
+matrix_add_const(ee_u32 N, MATDAT *A, MATDAT val)
+{
+    ee_u32 i, j;
+    for (i = 0; i < N; i++)
+    {
+        for (j = 0; j < N; j++)
+        {
+            A[i * N + j] += val;
+        }
+    }
 }

 /* Function: matrix_mul_vect
-	Multiply a matrix by a vector.
-	This is common in many simple filters (e.g. fir where a vector of coefficients is applied to the matrix.)
+        Multiply a matrix by a vector.
+        This is common in many simple filters (e.g. fir where a vector of
+   coefficients is applied to the matrix.)
 */
-void matrix_mul_vect(ee_u32 N, MATRES *C, MATDAT *A, MATDAT *B) {
-	ee_u32 i,j;
-	for (i=0; i<N; i++) {
-		C[i]=0;
-		for (j=0; j<N; j++) {
-			C[i]+=(MATRES)A[i*N+j] * (MATRES)B[j];
-		}
-	}
+void
+matrix_mul_vect(ee_u32 N, MATRES *C, MATDAT *A, MATDAT *B)
+{
+    ee_u32 i, j;
+    for (i = 0; i < N; i++)
+    {
+        C[i] = 0;
+        for (j = 0; j < N; j++)
+        {
+            C[i] += (MATRES)A[i * N + j] * (MATRES)B[j];
+        }
+    }
 }

 /* Function: matrix_mul_matrix
-	Multiply a matrix by a matrix.
-	Basic code is used in many algorithms, mostly with minor changes such as scaling.
+        Multiply a matrix by a matrix.
+        Basic code is used in many algorithms, mostly with minor changes such as
+   scaling.
 */
-void matrix_mul_matrix(ee_u32 N, MATRES *C, MATDAT *A, MATDAT *B) {
-	ee_u32 i,j,k;
-	for (i=0; i<N; i++) {
-		for (j=0; j<N; j++) {
-			C[i*N+j]=0;
-			for(k=0;k<N;k++)
-			{
-				C[i*N+j]+=(MATRES)A[i*N+k] * (MATRES)B[k*N+j];
-			}
-		}
-	}
+void
+matrix_mul_matrix(ee_u32 N, MATRES *C, MATDAT *A, MATDAT *B)
+{
+    ee_u32 i, j, k;
+    for (i = 0; i < N; i++)
+    {
+        for (j = 0; j < N; j++)
+        {
+            C[i * N + j] = 0;
+            for (k = 0; k < N; k++)
+            {
+                C[i * N + j] += (MATRES)A[i * N + k] * (MATRES)B[k * N + j];
+            }
+        }
+    }
 }

 /* Function: matrix_mul_matrix_bitextract
-	Multiply a matrix by a matrix, and extract some bits from the result.
-	Basic code is used in many algorithms, mostly with minor changes such as scaling.
+        Multiply a matrix by a matrix, and extract some bits from the result.
+        Basic code is used in many algorithms, mostly with minor changes such as
+   scaling.
 */
-void matrix_mul_matrix_bitextract(ee_u32 N, MATRES *C, MATDAT *A, MATDAT *B) {
-	ee_u32 i,j,k;
-	for (i=0; i<N; i++) {
-		for (j=0; j<N; j++) {
-			C[i*N+j]=0;
-			for(k=0;k<N;k++)
-			{
-				MATRES tmp=(MATRES)A[i*N+k] * (MATRES)B[k*N+j];
-				C[i*N+j]+=bit_extract(tmp,2,4)*bit_extract(tmp,5,7);
-			}
-		}
-	}
+void
+matrix_mul_matrix_bitextract(ee_u32 N, MATRES *C, MATDAT *A, MATDAT *B)
+{
+    ee_u32 i, j, k;
+    for (i = 0; i < N; i++)
+    {
+        for (j = 0; j < N; j++)
+        {
+            C[i * N + j] = 0;
+            for (k = 0; k < N; k++)
+            {
+                MATRES tmp = (MATRES)A[i * N + k] * (MATRES)B[k * N + j];
+                C[i * N + j] += bit_extract(tmp, 2, 4) * bit_extract(tmp, 5, 7);
+            }
+        }
+    }
 }
--- a/riscv-coremark/coremark/core_state.c
+++ b/riscv-coremark/coremark/core_state.c
@ -18,260 +18,313 @@ Original Author: Shay Gal-on

 #include "coremark.h"
 /* local functions */
-enum CORE_STATE core_state_transition( ee_u8 **instr , ee_u32 *transition_count);
+enum CORE_STATE core_state_transition(ee_u8 **instr, ee_u32 *transition_count);

 /*
 Topic: Description
-	Simple state machines like this one are used in many embedded products.
-	
-	For more complex state machines, sometimes a state transition table implementation is used instead, 
-	trading speed of direct coding for ease of maintenance.
-	
-	Since the main goal of using a state machine in CoreMark is to excercise the switch/if behaviour,
-	we are using a small moore machine. 
-	
-	In particular, this machine tests type of string input,
-	trying to determine whether the input is a number or something else.
-	(see core_state.png).
+        Simple state machines like this one are used in many embedded products.
+
+        For more complex state machines, sometimes a state transition table
+implementation is used instead, trading speed of direct coding for ease of
+maintenance.
+
+        Since the main goal of using a state machine in CoreMark is to exercise
+the switch/if behaviour, we are using a small Moore machine.
+
+        In particular, this machine tests type of string input,
+        trying to determine whether the input is a number or something else.
+        (see core_state.png).
 */

 /* Function: core_bench_state
-	Benchmark function
+        Benchmark function

-	Go over the input twice, once direct, and once after introducing some corruption. 
+        Go over the input twice, once direct, and once after introducing some
+   corruption.
 */
-ee_u16 core_bench_state(ee_u32 blksize, ee_u8 *memblock, 
-		ee_s16 seed1, ee_s16 seed2, ee_s16 step, ee_u16 crc) 
+ee_u16
+core_bench_state(ee_u32 blksize,
+                 ee_u8 *memblock,
+                 ee_s16 seed1,
+                 ee_s16 seed2,
+                 ee_s16 step,
+                 ee_u16 crc)
 {
-	ee_u32 final_counts[NUM_CORE_STATES];
-	ee_u32 track_counts[NUM_CORE_STATES];
-	ee_u8 *p=memblock;
-	ee_u32 i;
-
+    ee_u32 final_counts[NUM_CORE_STATES];
+    ee_u32 track_counts[NUM_CORE_STATES];
+    ee_u8 *p = memblock;
+    ee_u32 i;

 #if CORE_DEBUG
-	ee_printf("State Bench: %d,%d,%d,%04x\n",seed1,seed2,step,crc);
+    ee_printf("State Bench: %d,%d,%d,%04x\n", seed1, seed2, step, crc);
 #endif
-	for (i=0; i<NUM_CORE_STATES; i++) {
-		final_counts[i]=track_counts[i]=0;
-	}
-	/* run the state machine over the input */
-	while (*p!=0) {
-		enum CORE_STATE fstate=core_state_transition(&p,track_counts);
-		final_counts[fstate]++;
+    for (i = 0; i < NUM_CORE_STATES; i++)
+    {
+        final_counts[i] = track_counts[i] = 0;
+    }
+    /* run the state machine over the input */
+    while (*p != 0)
+    {
+        enum CORE_STATE fstate = core_state_transition(&p, track_counts);
+        final_counts[fstate]++;
 #if CORE_DEBUG
-	ee_printf("%d,",fstate);
-	}
-	ee_printf("\n");
+        ee_printf("%d,", fstate);
+    }
+    ee_printf("\n");
 #else
-	}
+    }
 #endif
-	p=memblock;
-	while (p < (memblock+blksize)) { /* insert some corruption */
-		if (*p!=',')
-			*p^=(ee_u8)seed1;
-		p+=step;
-	}
-	p=memblock;
-	/* run the state machine over the input again */
-	while (*p!=0) {
-		enum CORE_STATE fstate=core_state_transition(&p,track_counts);
-		final_counts[fstate]++;
+    p = memblock;
+    while (p < (memblock + blksize))
+    { /* insert some corruption */
+        if (*p != ',')
+            *p ^= (ee_u8)seed1;
+        p += step;
+    }
+    p = memblock;
+    /* run the state machine over the input again */
+    while (*p != 0)
+    {
+        enum CORE_STATE fstate = core_state_transition(&p, track_counts);
+        final_counts[fstate]++;
 #if CORE_DEBUG
-	ee_printf("%d,",fstate);
-	}
-	ee_printf("\n");
+        ee_printf("%d,", fstate);
+    }
+    ee_printf("\n");
 #else
-	}
+    }
 #endif
-	p=memblock;
-	while (p < (memblock+blksize)) { /* undo corruption is seed1 and seed2 are equal */
-		if (*p!=',')
-			*p^=(ee_u8)seed2;
-		p+=step;
-	}
-	/* end timing */
-	for (i=0; i<NUM_CORE_STATES; i++) {
-		crc=crcu32(final_counts[i],crc);
-		crc=crcu32(track_counts[i],crc);
-	}
-	return crc;
+    p = memblock;
+    while (p < (memblock + blksize))
+    { /* undo corruption if seed1 and seed2 are equal */
+        if (*p != ',')
+            *p ^= (ee_u8)seed2;
+        p += step;
+    }
+    /* end timing */
+    for (i = 0; i < NUM_CORE_STATES; i++)
+    {
+        crc = crcu32(final_counts[i], crc);
+        crc = crcu32(track_counts[i], crc);
+    }
+    return crc;
 }

 /* Default initialization patterns */
-static ee_u8 *intpat[4]  ={(ee_u8 *)"5012",(ee_u8 *)"1234",(ee_u8 *)"-874",(ee_u8 *)"+122"};
-static ee_u8 *floatpat[4]={(ee_u8 *)"35.54400",(ee_u8 *)".1234500",(ee_u8 *)"-110.700",(ee_u8 *)"+0.64400"};
-static ee_u8 *scipat[4]  ={(ee_u8 *)"5.500e+3",(ee_u8 *)"-.123e-2",(ee_u8 *)"-87e+832",(ee_u8 *)"+0.6e-12"};
-static ee_u8 *errpat[4]  ={(ee_u8 *)"T0.3e-1F",(ee_u8 *)"-T.T++Tq",(ee_u8 *)"1T3.4e4z",(ee_u8 *)"34.0e-T^"};
+static ee_u8 *intpat[4]
+    = { (ee_u8 *)"5012", (ee_u8 *)"1234", (ee_u8 *)"-874", (ee_u8 *)"+122" };
+static ee_u8 *floatpat[4] = { (ee_u8 *)"35.54400",
+                              (ee_u8 *)".1234500",
+                              (ee_u8 *)"-110.700",
+                              (ee_u8 *)"+0.64400" };
+static ee_u8 *scipat[4]   = { (ee_u8 *)"5.500e+3",
+                            (ee_u8 *)"-.123e-2",
+                            (ee_u8 *)"-87e+832",
+                            (ee_u8 *)"+0.6e-12" };
+static ee_u8 *errpat[4]   = { (ee_u8 *)"T0.3e-1F",
+                            (ee_u8 *)"-T.T++Tq",
+                            (ee_u8 *)"1T3.4e4z",
+                            (ee_u8 *)"34.0e-T^" };

 /* Function: core_init_state
-	Initialize the input data for the state machine.
+        Initialize the input data for the state machine.

-	Populate the input with several predetermined strings, interspersed.
-	Actual patterns chosen depend on the seed parameter.
-	
-	Note:
-	The seed parameter MUST be supplied from a source that cannot be determined at compile time
+        Populate the input with several predetermined strings, interspersed.
+        Actual patterns chosen depend on the seed parameter.
+
+        Note:
+        The seed parameter MUST be supplied from a source that cannot be
+   determined at compile time
 */
-void core_init_state(ee_u32 size, ee_s16 seed, ee_u8 *p) {
-	ee_u32 total=0,next=0,i;
-	ee_u8 *buf=0;
+void
+core_init_state(ee_u32 size, ee_s16 seed, ee_u8 *p)
+{
+    ee_u32 total = 0, next = 0, i;
+    ee_u8 *buf = 0;
 #if CORE_DEBUG
-	ee_u8 *start=p;
-	ee_printf("State: %d,%d\n",size,seed);
+    ee_u8 *start = p;
+    ee_printf("State: %d,%d\n", size, seed);
 #endif
-	size--;
-	next=0;
-	while ((total+next+1)<size) {
-		if (next>0) {
-			for(i=0;i<next;i++)
-				*(p+total+i)=buf[i];
-			*(p+total+i)=',';
-			total+=next+1;
-		}
-		seed++;
-		switch (seed & 0x7) {
-			case 0: /* int */
-			case 1: /* int */
-			case 2: /* int */
-				buf=intpat[(seed>>3) & 0x3];
-				next=4;
-			break;
-			case 3: /* float */
-			case 4: /* float */
-				buf=floatpat[(seed>>3) & 0x3];
-				next=8;
-			break;
-			case 5: /* scientific */
-			case 6: /* scientific */
-				buf=scipat[(seed>>3) & 0x3];
-				next=8;
-			break;
-			case 7: /* invalid */
-				buf=errpat[(seed>>3) & 0x3];
-				next=8;
-			break;
-			default: /* Never happen, just to make some compilers happy */
-			break;
-		}
-	}
-	size++;
-	while (total<size) { /* fill the rest with 0 */
-		*(p+total)=0;
-		total++;
-	}
+    size--;
+    next = 0;
+    while ((total + next + 1) < size)
+    {
+        if (next > 0)
+        {
+            for (i = 0; i < next; i++)
+                *(p + total + i) = buf[i];
+            *(p + total + i) = ',';
+            total += next + 1;
+        }
+        seed++;
+        switch (seed & 0x7)
+        {
+            case 0: /* int */
+            case 1: /* int */
+            case 2: /* int */
+                buf  = intpat[(seed >> 3) & 0x3];
+                next = 4;
+                break;
+            case 3: /* float */
+            case 4: /* float */
+                buf  = floatpat[(seed >> 3) & 0x3];
+                next = 8;
+                break;
+            case 5: /* scientific */
+            case 6: /* scientific */
+                buf  = scipat[(seed >> 3) & 0x3];
+                next = 8;
+                break;
+            case 7: /* invalid */
+                buf  = errpat[(seed >> 3) & 0x3];
+                next = 8;
+                break;
+            default: /* Never happen, just to make some compilers happy */
+                break;
+        }
+    }
+    size++;
+    while (total < size)
+    { /* fill the rest with 0 */
+        *(p + total) = 0;
+        total++;
+    }
 #if CORE_DEBUG
-	ee_printf("State Input: %s\n",start);
+    ee_printf("State Input: %s\n", start);
 #endif
 }

-static ee_u8 ee_isdigit(ee_u8 c) {
-	ee_u8 retval;
-	retval = ((c>='0') & (c<='9')) ? 1 : 0;
-	return retval;
+static ee_u8
+ee_isdigit(ee_u8 c)
+{
+    ee_u8 retval;
+    retval = ((c >= '0') & (c <= '9')) ? 1 : 0;
+    return retval;
 }

 /* Function: core_state_transition
-	Actual state machine.
+        Actual state machine.

-	The state machine will continue scanning until either:
-	1 - an invalid input is detcted.
-	2 - a valid number has been detected.
-	
-	The input pointer is updated to point to the end of the token, and the end state is returned (either specific format determined or invalid).
+        The state machine will continue scanning until either:
+        1 - an invalid input is detected.
+        2 - a valid number has been detected.
+
+        The input pointer is updated to point to the end of the token, and the
+   end state is returned (either specific format determined or invalid).
 */

-enum CORE_STATE core_state_transition( ee_u8 **instr , ee_u32 *transition_count) {
-	ee_u8 *str=*instr;
-	ee_u8 NEXT_SYMBOL;
-	enum CORE_STATE state=CORE_START;
-	for( ; *str && state != CORE_INVALID; str++ ) {
-		NEXT_SYMBOL = *str;
-		if (NEXT_SYMBOL==',') /* end of this input */ {
-			str++;
-			break;
-		}
-		switch(state) {
-		case CORE_START:
-			if(ee_isdigit(NEXT_SYMBOL)) {
-				state = CORE_INT;
-			}
-			else if( NEXT_SYMBOL == '+' || NEXT_SYMBOL == '-' ) {
-				state = CORE_S1;
-			}
-			else if( NEXT_SYMBOL == '.' ) {
-				state = CORE_FLOAT;
-			}
-			else {
-				state = CORE_INVALID;
-				transition_count[CORE_INVALID]++;
-			}
-			transition_count[CORE_START]++;
-			break;
-		case CORE_S1:
-			if(ee_isdigit(NEXT_SYMBOL)) {
-				state = CORE_INT;
-				transition_count[CORE_S1]++;
-			}
-			else if( NEXT_SYMBOL == '.' ) {
-				state = CORE_FLOAT;
-				transition_count[CORE_S1]++;
-			}
-			else {
-				state = CORE_INVALID;
-				transition_count[CORE_S1]++;
-			}
-			break;
-		case CORE_INT:
-			if( NEXT_SYMBOL == '.' ) {
-				state = CORE_FLOAT;
-				transition_count[CORE_INT]++;
-			}
-			else if(!ee_isdigit(NEXT_SYMBOL)) {
-				state = CORE_INVALID;
-				transition_count[CORE_INT]++;
-			}
-			break;
-		case CORE_FLOAT:
-			if( NEXT_SYMBOL == 'E' || NEXT_SYMBOL == 'e' ) {
-				state = CORE_S2;
-				transition_count[CORE_FLOAT]++;
-			}
-			else if(!ee_isdigit(NEXT_SYMBOL)) {
-				state = CORE_INVALID;
-				transition_count[CORE_FLOAT]++;
-			}
-			break;
-		case CORE_S2:
-			if( NEXT_SYMBOL == '+' || NEXT_SYMBOL == '-' ) {
-				state = CORE_EXPONENT;
-				transition_count[CORE_S2]++;
-			}
-			else {
-				state = CORE_INVALID;
-				transition_count[CORE_S2]++;
-			}
-			break;
-		case CORE_EXPONENT:
-			if(ee_isdigit(NEXT_SYMBOL)) {
-				state = CORE_SCIENTIFIC;
-				transition_count[CORE_EXPONENT]++;
-			}
-			else {
-				state = CORE_INVALID;
-				transition_count[CORE_EXPONENT]++;
-			}
-			break;
-		case CORE_SCIENTIFIC:
-			if(!ee_isdigit(NEXT_SYMBOL)) {
-				state = CORE_INVALID;
-				transition_count[CORE_INVALID]++;
-			}
-			break;
-		default:
-			break;
-		}
-	}
-	*instr=str;
-	return state;
+enum CORE_STATE
+core_state_transition(ee_u8 **instr, ee_u32 *transition_count)
+{
+    ee_u8 *         str = *instr;
+    ee_u8           NEXT_SYMBOL;
+    enum CORE_STATE state = CORE_START;
+    for (; *str && state != CORE_INVALID; str++)
+    {
+        NEXT_SYMBOL = *str;
+        if (NEXT_SYMBOL == ',') /* end of this input */
+        {
+            str++;
+            break;
+        }
+        switch (state)
+        {
+            case CORE_START:
+                if (ee_isdigit(NEXT_SYMBOL))
+                {
+                    state = CORE_INT;
+                }
+                else if (NEXT_SYMBOL == '+' || NEXT_SYMBOL == '-')
+                {
+                    state = CORE_S1;
+                }
+                else if (NEXT_SYMBOL == '.')
+                {
+                    state = CORE_FLOAT;
+                }
+                else
+                {
+                    state = CORE_INVALID;
+                    transition_count[CORE_INVALID]++;
+                }
+                transition_count[CORE_START]++;
+                break;
+            case CORE_S1:
+                if (ee_isdigit(NEXT_SYMBOL))
+                {
+                    state = CORE_INT;
+                    transition_count[CORE_S1]++;
+                }
+                else if (NEXT_SYMBOL == '.')
+                {
+                    state = CORE_FLOAT;
+                    transition_count[CORE_S1]++;
+                }
+                else
+                {
+                    state = CORE_INVALID;
+                    transition_count[CORE_S1]++;
+                }
+                break;
+            case CORE_INT:
+                if (NEXT_SYMBOL == '.')
+                {
+                    state = CORE_FLOAT;
+                    transition_count[CORE_INT]++;
+                }
+                else if (!ee_isdigit(NEXT_SYMBOL))
+                {
+                    state = CORE_INVALID;
+                    transition_count[CORE_INT]++;
+                }
+                break;
+            case CORE_FLOAT:
+                if (NEXT_SYMBOL == 'E' || NEXT_SYMBOL == 'e')
+                {
+                    state = CORE_S2;
+                    transition_count[CORE_FLOAT]++;
+                }
+                else if (!ee_isdigit(NEXT_SYMBOL))
+                {
+                    state = CORE_INVALID;
+                    transition_count[CORE_FLOAT]++;
+                }
+                break;
+            case CORE_S2:
+                if (NEXT_SYMBOL == '+' || NEXT_SYMBOL == '-')
+                {
+                    state = CORE_EXPONENT;
+                    transition_count[CORE_S2]++;
+                }
+                else
+                {
+                    state = CORE_INVALID;
+                    transition_count[CORE_S2]++;
+                }
+                break;
+            case CORE_EXPONENT:
+                if (ee_isdigit(NEXT_SYMBOL))
+                {
+                    state = CORE_SCIENTIFIC;
+                    transition_count[CORE_EXPONENT]++;
+                }
+                else
+                {
+                    state = CORE_INVALID;
+                    transition_count[CORE_EXPONENT]++;
+                }
+                break;
+            case CORE_SCIENTIFIC:
+                if (!ee_isdigit(NEXT_SYMBOL))
+                {
+                    state = CORE_INVALID;
+                    transition_count[CORE_INVALID]++;
+                }
+                break;
+            default:
+                break;
+        }
+    }
+    *instr = str;
+    return state;
 }
--- a/riscv-coremark/coremark/core_util.c
+++ b/riscv-coremark/coremark/core_util.c
@ -18,193 +18,232 @@ Original Author: Shay Gal-on

 #include "coremark.h"
 /* Function: get_seed
-	Get a values that cannot be determined at compile time.
+        Get a values that cannot be determined at compile time.

-	Since different embedded systems and compilers are used, 3 different methods are provided:
-	1 - Using a volatile variable. This method is only valid if the compiler is forced to generate code that
-	reads the value of a volatile variable from memory at run time. 
-	Please note, if using this method, you would need to modify core_portme.c to generate training profile.
-	2 - Command line arguments. This is the preferred method if command line arguments are supported.
-	3 - System function. If none of the first 2 methods is available on the platform,
-	a system function which is not a stub can be used.
-	
-	e.g. read the value on GPIO pins connected to switches, or invoke special simulator functions.
+        Since different embedded systems and compilers are used, 3 different
+   methods are provided: 1 - Using a volatile variable. This method is only
+   valid if the compiler is forced to generate code that reads the value of a
+   volatile variable from memory at run time. Please note, if using this method,
+   you would need to modify core_portme.c to generate training profile. 2 -
+   Command line arguments. This is the preferred method if command line
+   arguments are supported. 3 - System function. If none of the first 2 methods
+   is available on the platform, a system function which is not a stub can be
+   used.
+
+        e.g. read the value on GPIO pins connected to switches, or invoke
+   special simulator functions.
 */
-#if (SEED_METHOD==SEED_VOLATILE)
-	extern volatile ee_s32 seed1_volatile;
-	extern volatile ee_s32 seed2_volatile;
-	extern volatile ee_s32 seed3_volatile;
-	extern volatile ee_s32 seed4_volatile;
-	extern volatile ee_s32 seed5_volatile;
-	ee_s32 get_seed_32(int i) {
-		ee_s32 retval;
-		switch (i) {
-			case 1:
-				retval=seed1_volatile;
-				break;
-			case 2:
-				retval=seed2_volatile;
-				break;
-			case 3:
-				retval=seed3_volatile;
-				break;
-			case 4:
-				retval=seed4_volatile;
-				break;
-			case 5:
-				retval=seed5_volatile;
-				break;
-			default:
-				retval=0;
-				break;
-		}
-		return retval;
-	}
-#elif (SEED_METHOD==SEED_ARG)
-ee_s32 parseval(char *valstring) {
-	ee_s32 retval=0;
-	ee_s32 neg=1;
-	int hexmode=0;
-	if (*valstring == '-') {
-		neg=-1;
-		valstring++;
-	}
-	if ((valstring[0] == '0') && (valstring[1] == 'x')) {
-		hexmode=1;
-		valstring+=2;
-	}
-		/* first look for digits */
-	if (hexmode) {
-		while (((*valstring >= '0') && (*valstring <= '9')) || ((*valstring >= 'a') && (*valstring <= 'f'))) {
-			ee_s32 digit=*valstring-'0';
-			if (digit>9)
-				digit=10+*valstring-'a';
-			retval*=16;
-			retval+=digit;
-			valstring++;
-		}
-	} else {
-		while ((*valstring >= '0') && (*valstring <= '9')) {
-			ee_s32 digit=*valstring-'0';
-			retval*=10;
-			retval+=digit;
-			valstring++;
-		}
-	}
-	/* now add qualifiers */
-	if (*valstring=='K')
-		retval*=1024;
-	if (*valstring=='M')
-		retval*=1024*1024;
+#if (SEED_METHOD == SEED_VOLATILE)
+extern volatile ee_s32 seed1_volatile;
+extern volatile ee_s32 seed2_volatile;
+extern volatile ee_s32 seed3_volatile;
+extern volatile ee_s32 seed4_volatile;
+extern volatile ee_s32 seed5_volatile;
+ee_s32
+get_seed_32(int i)
+{
+    ee_s32 retval;
+    switch (i)
+    {
+        case 1:
+            retval = seed1_volatile;
+            break;
+        case 2:
+            retval = seed2_volatile;
+            break;
+        case 3:
+            retval = seed3_volatile;
+            break;
+        case 4:
+            retval = seed4_volatile;
+            break;
+        case 5:
+            retval = seed5_volatile;
+            break;
+        default:
+            retval = 0;
+            break;
+    }
+    return retval;
+}
+#elif (SEED_METHOD == SEED_ARG)
+ee_s32
+parseval(char *valstring)
+{
+    ee_s32 retval  = 0;
+    ee_s32 neg     = 1;
+    int    hexmode = 0;
+    if (*valstring == '-')
+    {
+        neg = -1;
+        valstring++;
+    }
+    if ((valstring[0] == '0') && (valstring[1] == 'x'))
+    {
+        hexmode = 1;
+        valstring += 2;
+    }
+    /* first look for digits */
+    if (hexmode)
+    {
+        while (((*valstring >= '0') && (*valstring <= '9'))
+               || ((*valstring >= 'a') && (*valstring <= 'f')))
+        {
+            ee_s32 digit = *valstring - '0';
+            if (digit > 9)
+                digit = 10 + *valstring - 'a';
+            retval *= 16;
+            retval += digit;
+            valstring++;
+        }
+    }
+    else
+    {
+        while ((*valstring >= '0') && (*valstring <= '9'))
+        {
+            ee_s32 digit = *valstring - '0';
+            retval *= 10;
+            retval += digit;
+            valstring++;
+        }
+    }
+    /* now add qualifiers */
+    if (*valstring == 'K')
+        retval *= 1024;
+    if (*valstring == 'M')
+        retval *= 1024 * 1024;

-	retval*=neg;
-	return retval;
+    retval *= neg;
+    return retval;
 }

-ee_s32 get_seed_args(int i, int argc, char *argv[]) {
-	if (argc>i)
-		return parseval(argv[i]);
-	return 0;
+ee_s32
+get_seed_args(int i, int argc, char *argv[])
+{
+    if (argc > i)
+        return parseval(argv[i]);
+    return 0;
 }

-#elif (SEED_METHOD==SEED_FUNC)
-/* If using OS based function, you must define and implement the functions below in core_portme.h and core_portme.c ! */
-ee_s32 get_seed_32(int i) {
-	ee_s32 retval;
-	switch (i) {
-		case 1:
-			retval=portme_sys1();
-			break;
-		case 2:
-			retval=portme_sys2();
-			break;
-		case 3:
-			retval=portme_sys3();
-			break;
-		case 4:
-			retval=portme_sys4();
-			break;
-		case 5:
-			retval=portme_sys5();
-			break;
-		default:
-			retval=0;
-			break;
-	}
-	return retval;
+#elif (SEED_METHOD == SEED_FUNC)
+/* If using OS based function, you must define and implement the functions below
+ * in core_portme.h and core_portme.c ! */
+ee_s32
+get_seed_32(int i)
+{
+    ee_s32 retval;
+    switch (i)
+    {
+        case 1:
+            retval = portme_sys1();
+            break;
+        case 2:
+            retval = portme_sys2();
+            break;
+        case 3:
+            retval = portme_sys3();
+            break;
+        case 4:
+            retval = portme_sys4();
+            break;
+        case 5:
+            retval = portme_sys5();
+            break;
+        default:
+            retval = 0;
+            break;
+    }
+    return retval;
 }
 #endif

 /* Function: crc*
-	Service functions to calculate 16b CRC code.
+        Service functions to calculate 16b CRC code.

 */
-ee_u16 crcu8(ee_u8 data, ee_u16 crc )
+ee_u16
+crcu8(ee_u8 data, ee_u16 crc)
 {
-	ee_u8 i=0,x16=0,carry=0;
+    ee_u8 i = 0, x16 = 0, carry = 0;

-	for (i = 0; i < 8; i++)
+    for (i = 0; i < 8; i++)
    {
-		x16 = (ee_u8)((data & 1) ^ ((ee_u8)crc & 1));
-		data >>= 1;
+        x16 = (ee_u8)((data & 1) ^ ((ee_u8)crc & 1));
+        data >>= 1;

-		if (x16 == 1)
-		{
-		   crc ^= 0x4002;
-		   carry = 1;
-		}
-		else 
-			carry = 0;
-		crc >>= 1;
-		if (carry)
-		   crc |= 0x8000;
-		else
-		   crc &= 0x7fff;
+        if (x16 == 1)
+        {
+            crc ^= 0x4002;
+            carry = 1;
+        }
+        else
+            carry = 0;
+        crc >>= 1;
+        if (carry)
+            crc |= 0x8000;
+        else
+            crc &= 0x7fff;
    }
-	return crc;
-} 
-ee_u16 crcu16(ee_u16 newval, ee_u16 crc) {
-	crc=crcu8( (ee_u8) (newval)				,crc);
-	crc=crcu8( (ee_u8) ((newval)>>8)	,crc);
-	return crc;
+    return crc;
 }
-ee_u16 crcu32(ee_u32 newval, ee_u16 crc) {
-	crc=crc16((ee_s16) newval		,crc);
-	crc=crc16((ee_s16) (newval>>16)	,crc);
-	return crc;
+ee_u16
+crcu16(ee_u16 newval, ee_u16 crc)
+{
+    crc = crcu8((ee_u8)(newval), crc);
+    crc = crcu8((ee_u8)((newval) >> 8), crc);
+    return crc;
 }
-ee_u16 crc16(ee_s16 newval, ee_u16 crc) {
-	return crcu16((ee_u16)newval, crc);
+ee_u16
+crcu32(ee_u32 newval, ee_u16 crc)
+{
+    crc = crc16((ee_s16)newval, crc);
+    crc = crc16((ee_s16)(newval >> 16), crc);
+    return crc;
+}
+ee_u16
+crc16(ee_s16 newval, ee_u16 crc)
+{
+    return crcu16((ee_u16)newval, crc);
 }

-ee_u8 check_data_types() {
-	ee_u8 retval=0;
-	if (sizeof(ee_u8) != 1) {
-		ee_printf("ERROR: ee_u8 is not an 8b datatype!\n");
-		retval++;
-	}
-	if (sizeof(ee_u16) != 2) {
-		ee_printf("ERROR: ee_u16 is not a 16b datatype!\n");
-		retval++;
-	}
-	if (sizeof(ee_s16) != 2) {
-		ee_printf("ERROR: ee_s16 is not a 16b datatype!\n");
-		retval++;
-	}
-	if (sizeof(ee_s32) != 4) {
-		ee_printf("ERROR: ee_s32 is not a 32b datatype!\n");
-		retval++;
-	}
-	if (sizeof(ee_u32) != 4) {
-		ee_printf("ERROR: ee_u32 is not a 32b datatype!\n");
-		retval++;
-	}
-	if (sizeof(ee_ptr_int) != sizeof(int *)) {
-		ee_printf("ERROR: ee_ptr_int is not a datatype that holds an int pointer!\n");
-		retval++;
-	}
-	if (retval>0) {
-		ee_printf("ERROR: Please modify the datatypes in core_portme.h!\n");
-	}
-	return retval;
+ee_u8
+check_data_types()
+{
+    ee_u8 retval = 0;
+    if (sizeof(ee_u8) != 1)
+    {
+        ee_printf("ERROR: ee_u8 is not an 8b datatype!\n");
+        retval++;
+    }
+    if (sizeof(ee_u16) != 2)
+    {
+        ee_printf("ERROR: ee_u16 is not a 16b datatype!\n");
+        retval++;
+    }
+    if (sizeof(ee_s16) != 2)
+    {
+        ee_printf("ERROR: ee_s16 is not a 16b datatype!\n");
+        retval++;
+    }
+    if (sizeof(ee_s32) != 4)
+    {
+        ee_printf("ERROR: ee_s32 is not a 32b datatype!\n");
+        retval++;
+    }
+    if (sizeof(ee_u32) != 4)
+    {
+        ee_printf("ERROR: ee_u32 is not a 32b datatype!\n");
+        retval++;
+    }
+    if (sizeof(ee_ptr_int) != sizeof(int *))
+    {
+        ee_printf(
+            "ERROR: ee_ptr_int is not a datatype that holds an int pointer!\n");
+        retval++;
+    }
+    if (retval > 0)
+    {
+        ee_printf("ERROR: Please modify the datatypes in core_portme.h!\n");
+    }
+    return retval;
 }
--- a/riscv-coremark/coremark/coremark.h
+++ b/riscv-coremark/coremark/coremark.h
@ -17,23 +17,23 @@ Original Author: Shay Gal-on
 */

 /* Topic: Description
-	This file contains  declarations of the various benchmark functions.
+        This file contains  declarations of the various benchmark functions.
 */

 /* Configuration: TOTAL_DATA_SIZE
-	Define total size for data algorithms will operate on
+        Define total size for data algorithms will operate on
 */
-#ifndef TOTAL_DATA_SIZE 
-#define TOTAL_DATA_SIZE 2*1000
+#ifndef TOTAL_DATA_SIZE
+#define TOTAL_DATA_SIZE 2 * 1000
 #endif

-#define SEED_ARG 0
-#define SEED_FUNC 1
+#define SEED_ARG      0
+#define SEED_FUNC     1
 #define SEED_VOLATILE 2

 #define MEM_STATIC 0
 #define MEM_MALLOC 1
-#define MEM_STACK 2
+#define MEM_STACK  2

 #include "core_portme.h"

@ -48,8 +48,8 @@ Original Author: Shay Gal-on
 void *iterate(void *pres);

 /* Typedef: secs_ret
-	For machines that have floating point support, get number of seconds as a double. 
-	Otherwise an unsigned int.
+        For machines that have floating point support, get number of seconds as
+   a double. Otherwise an unsigned int.
 */
 #if HAS_FLOAT
 typedef double secs_ret;
@ -58,47 +58,48 @@ typedef ee_u32 secs_ret;
 #endif

 #if MAIN_HAS_NORETURN
-#define MAIN_RETURN_VAL 
+#define MAIN_RETURN_VAL
 #define MAIN_RETURN_TYPE void
 #else
-#define MAIN_RETURN_VAL 0
+#define MAIN_RETURN_VAL  0
 #define MAIN_RETURN_TYPE int
-#endif 
+#endif

-void start_time(void);
-void stop_time(void);
+void       start_time(void);
+void       stop_time(void);
 CORE_TICKS get_time(void);
-secs_ret time_in_secs(CORE_TICKS ticks);
+secs_ret   time_in_secs(CORE_TICKS ticks);

 /* Misc useful functions */
 ee_u16 crcu8(ee_u8 data, ee_u16 crc);
 ee_u16 crc16(ee_s16 newval, ee_u16 crc);
 ee_u16 crcu16(ee_u16 newval, ee_u16 crc);
 ee_u16 crcu32(ee_u32 newval, ee_u16 crc);
-ee_u8 check_data_types();
-void *portable_malloc(ee_size_t size);
-void portable_free(void *p);
+ee_u8  check_data_types(void);
+void * portable_malloc(ee_size_t size);
+void   portable_free(void *p);
 ee_s32 parseval(char *valstring);

 /* Algorithm IDS */
-#define ID_LIST 	(1<<0)
-#define ID_MATRIX 	(1<<1)
-#define ID_STATE 	(1<<2)
-#define ALL_ALGORITHMS_MASK (ID_LIST|ID_MATRIX|ID_STATE)
-#define NUM_ALGORITHMS 3
+#define ID_LIST             (1 << 0)
+#define ID_MATRIX           (1 << 1)
+#define ID_STATE            (1 << 2)
+#define ALL_ALGORITHMS_MASK (ID_LIST | ID_MATRIX | ID_STATE)
+#define NUM_ALGORITHMS      3

 /* list data structures */
-typedef struct list_data_s {
-	ee_s16 data16;
-	ee_s16 idx;
+typedef struct list_data_s
+{
+    ee_s16 data16;
+    ee_s16 idx;
 } list_data;

-typedef struct list_head_s {
-	struct list_head_s *next;
-	struct list_data_s *info;
+typedef struct list_head_s
+{
+    struct list_head_s *next;
+    struct list_data_s *info;
 } list_head;

-
 /*matrix benchmark related stuff */
 #define MATDAT_INT 1
 #if MATDAT_INT
@ -109,66 +110,74 @@ typedef ee_f16 MATDAT;
 typedef ee_f32 MATRES;
 #endif

-typedef struct MAT_PARAMS_S {
-	int N;
-	MATDAT *A;
-	MATDAT *B;
-	MATRES *C;
+typedef struct MAT_PARAMS_S
+{
+    int     N;
+    MATDAT *A;
+    MATDAT *B;
+    MATRES *C;
 } mat_params;

 /* state machine related stuff */
 /* List of all the possible states for the FSM */
-typedef enum CORE_STATE {
-	CORE_START=0,
-	CORE_INVALID,
-	CORE_S1,
-	CORE_S2,
-	CORE_INT,
-	CORE_FLOAT,
-	CORE_EXPONENT,
-	CORE_SCIENTIFIC,
-	NUM_CORE_STATES
-} core_state_e ;
+typedef enum CORE_STATE
+{
+    CORE_START = 0,
+    CORE_INVALID,
+    CORE_S1,
+    CORE_S2,
+    CORE_INT,
+    CORE_FLOAT,
+    CORE_EXPONENT,
+    CORE_SCIENTIFIC,
+    NUM_CORE_STATES
+} core_state_e;

-		
 /* Helper structure to hold results */
-typedef struct RESULTS_S {
-	/* inputs */
-	ee_s16	seed1;		/* Initializing seed */
-	ee_s16	seed2;		/* Initializing seed */
-	ee_s16	seed3;		/* Initializing seed */
-	void	*memblock[4];	/* Pointer to safe memory location */
-	ee_u32	size;		/* Size of the data */
-	ee_u32 iterations;		/* Number of iterations to execute */
-	ee_u32	execs;		/* Bitmask of operations to execute */
-	struct list_head_s *list;
-	mat_params mat;
-	/* outputs */
-	ee_u16	crc;
-	ee_u16	crclist;
-	ee_u16	crcmatrix;
-	ee_u16	crcstate;
-	ee_s16	err;
-	/* ultithread specific */
-	core_portable port;
+typedef struct RESULTS_S
+{
+    /* inputs */
+    ee_s16              seed1;       /* Initializing seed */
+    ee_s16              seed2;       /* Initializing seed */
+    ee_s16              seed3;       /* Initializing seed */
+    void *              memblock[4]; /* Pointer to safe memory location */
+    ee_u32              size;        /* Size of the data */
+    ee_u32              iterations;  /* Number of iterations to execute */
+    ee_u32              execs;       /* Bitmask of operations to execute */
+    struct list_head_s *list;
+    mat_params          mat;
+    /* outputs */
+    ee_u16 crc;
+    ee_u16 crclist;
+    ee_u16 crcmatrix;
+    ee_u16 crcstate;
+    ee_s16 err;
+    /* ultithread specific */
+    core_portable port;
 } core_results;

 /* Multicore execution handling */
-#if (MULTITHREAD>1)
+#if (MULTITHREAD > 1)
 ee_u8 core_start_parallel(core_results *res);
 ee_u8 core_stop_parallel(core_results *res);
 #endif

 /* list benchmark functions */
 list_head *core_list_init(ee_u32 blksize, list_head *memblock, ee_s16 seed);
-ee_u16 core_bench_list(core_results *res, ee_s16 finder_idx);
+ee_u16     core_bench_list(core_results *res, ee_s16 finder_idx);

 /* state benchmark functions */
-void core_init_state(ee_u32 size, ee_s16 seed, ee_u8 *p);
-ee_u16 core_bench_state(ee_u32 blksize, ee_u8 *memblock, 
-		ee_s16 seed1, ee_s16 seed2, ee_s16 step, ee_u16 crc);
+void   core_init_state(ee_u32 size, ee_s16 seed, ee_u8 *p);
+ee_u16 core_bench_state(ee_u32 blksize,
+                        ee_u8 *memblock,
+                        ee_s16 seed1,
+                        ee_s16 seed2,
+                        ee_s16 step,
+                        ee_u16 crc);

 /* matrix benchmark functions */
-ee_u32 core_init_matrix(ee_u32 blksize, void *memblk, ee_s32 seed, mat_params *p);
+ee_u32 core_init_matrix(ee_u32      blksize,
+                        void *      memblk,
+                        ee_s32      seed,
+                        mat_params *p);
 ee_u16 core_bench_matrix(mat_params *p, ee_s16 seed, ee_u16 crc);
-
--- a/riscv-coremark/coremark/coremark.md5
+++ b/riscv-coremark/coremark/coremark.md5
@ -0,0 +1,6 @@
+8d082dc4a9676c02731a8cf209339072  core_list_join.c
+c984863b84b59185d8b5fb81c1ca7535  core_main.c
+5fa21a0f7c3964167c9691db531ca652  core_matrix.c
+edcfc7a0b146a50028014f06e6826aa3  core_state.c
+45540ba2145adea1ec7ea2c72a1fbbcb  core_util.c
+8ca974c013b380dc7f0d6d1afb76eb2d  coremark.h
--- a/riscv-coremark/coremark/cygwin/core_portme.mak
+++ b/riscv-coremark/coremark/cygwin/core_portme.mak
@ -14,128 +14,4 @@
 # 
 # Original Author: Shay Gal-on

-#File: core_portme.mak
-
-# Flag: OUTFLAG
-#	Use this flag to define how to to get an executable (e.g -o)
-OUTFLAG= -o
-# Flag: CC
-#	Use this flag to define compiler to use
-CC = gcc
-# Flag: CFLAGS
-#	Use this flag to define compiler options. Note, you can add compiler options from the command line using XCFLAGS="other flags"
-PORT_CFLAGS = -O2
-FLAGS_STR = "$(PORT_CFLAGS) $(XCFLAGS) $(XLFLAGS) $(LFLAGS_END)"
-CFLAGS = $(PORT_CFLAGS) -I$(PORT_DIR) -I. -DFLAGS_STR=\"$(FLAGS_STR)\"
-#Flag: LFLAGS_END
-#	Define any libraries needed for linking or other flags that should come at the end of the link line (e.g. linker scripts). 
-#	Note: On certain platforms, the default clock_gettime implementation is supported but requires linking of librt.
-LFLAGS_END = 
-# Flag: PORT_SRCS
-# Port specific source files can be added here
-PORT_SRCS = $(PORT_DIR)/core_portme.c
-# Flag: LOAD
-#	Define this flag if you need to load to a target, as in a cross compile environment.
-
-# Flag: RUN
-#	Define this flag if running does not consist of simple invocation of the binary.
-#	In a cross compile environment, you need to define this.
-
-#For flashing and using a tera term macro, you could use
-#LOAD = flash ADDR 
-#RUN =  ttpmacro coremark.ttl
-
-#For copying to target and executing via SSH connection, you could use
-#LOAD = scp $(OUTFILE)  user@target:~
-#RUN = ssh user@target -c  
-
-#For native compilation and execution
-LOAD = echo Loading done
-RUN = 
-
-OEXT = .o
-EXE = .exe
-
-# Flag: SEPARATE_COMPILE
-# Define if you need to separate compilation from link stage. 
-# In this case, you also need to define below how to create an object file, and how to link.
-ifdef SEPARATE_COMPILE
-
-LD		= gcc
-OBJOUT 	= -o
-LFLAGS 	=
-OFLAG 	= -o
-COUT 	= -c
-# Flag: PORT_OBJS
-# Port specific object files can be added here
-PORT_OBJS = $(PORT_DIR)/core_portme$(OEXT)
-PORT_CLEAN = *$(OEXT)
-
-$(OPATH)%$(OEXT) : %.c
-	$(CC) $(CFLAGS) $(XCFLAGS) $(COUT) $< $(OBJOUT) $@
-	
-endif
-
-# Target: port_prebuild
-# Generate any files that are needed before actual build starts.
-# E.g. generate profile guidance files. Sample PGO generation for gcc enabled with PGO=1
-#  - First, check if PGO was defined on the command line, if so, need to add -fprofile-use to compile line.
-#  - Second, if PGO reference has not yet been generated, add a step to the prebuild that will build a profile-generate version and run it.
-#  Note - Using REBUILD=1 
-#
-# Use make PGO=1 to invoke this sample processing.
-
-ifdef PGO
- ifeq (,$(findstring $(PGO),gen))
-  PGO_STAGE=build_pgo_gcc
-  CFLAGS+=-fprofile-use
- endif
- PORT_CLEAN+=*.gcda *.gcno gmon.out
-endif
-
-.PHONY: port_prebuild
-port_prebuild: $(PGO_STAGE)
-
-.PHONY: build_pgo_gcc
-build_pgo_gcc:
-	$(MAKE) PGO=gen XCFLAGS="$(XCFLAGS) -fprofile-generate -DTOTAL_DATA_SIZE=1200" ITERATIONS=10 gen_pgo_data REBUILD=1
-	
-# Target: port_postbuild
-# Generate any files that are needed after actual build end.
-# E.g. change format to srec, bin, zip in order to be able to load into flash
-.PHONY: port_postbuild
-port_postbuild:
-
-# Target: port_postrun
-# 	Do platform specific after run stuff. 
-#	E.g. reset the board, backup the logfiles etc.
-.PHONY: port_postrun
-port_postrun:
-
-# Target: port_prerun
-# 	Do platform specific after run stuff. 
-#	E.g. reset the board, backup the logfiles etc.
-.PHONY: port_prerun
-port_prerun:
-
-# Target: port_postload
-# 	Do platform specific after load stuff. 
-#	E.g. reset the reset power to the flash eraser
-.PHONY: port_postload
-port_postload:
-
-# Target: port_preload
-# 	Do platform specific before load stuff. 
-#	E.g. reset the reset power to the flash eraser
-.PHONY: port_preload
-port_preload:
-
-
-# FLAG: OPATH
-# Path to the output folder. Default - current folder.
-OPATH = ./
-MKDIR = mkdir -p
-
-# FLAG: PERL
-# Define perl executable to calculate the geomean if running separate.
-PERL=perl
+include posix/core_portme.mak
--- a/riscv-coremark/coremark/freebsd/core_portme.mak
+++ b/riscv-coremark/coremark/freebsd/core_portme.mak
@ -0,0 +1,17 @@
+# Copyright 2018 Embedded Microprocessor Benchmark Consortium (EEMBC)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# 
+# Original Author: Shay Gal-on
+
+include posix/core_portme.mak
--- a/riscv-coremark/coremark/linux/core_portme.mak
+++ b/riscv-coremark/coremark/linux/core_portme.mak
@ -14,127 +14,4 @@
 # 
 # Original Author: Shay Gal-on

-#File: core_portme.mak
-
-# Flag: OUTFLAG
-#	Use this flag to define how to to get an executable (e.g -o)
-OUTFLAG= -o
-# Flag: CC
-#	Use this flag to define compiler to use
-CC = gcc
-# Flag: CFLAGS
-#	Use this flag to define compiler options. Note, you can add compiler options from the command line using XCFLAGS="other flags"
-PORT_CFLAGS = -O2
-FLAGS_STR = "$(PORT_CFLAGS) $(XCFLAGS) $(XLFLAGS) $(LFLAGS_END)"
-CFLAGS = $(PORT_CFLAGS) -I$(PORT_DIR) -I. -DFLAGS_STR=\"$(FLAGS_STR)\"
-#Flag: LFLAGS_END
-#	Define any libraries needed for linking or other flags that should come at the end of the link line (e.g. linker scripts). 
-#	Note: On certain platforms, the default clock_gettime implementation is supported but requires linking of librt.
-LFLAGS_END += -lrt
-# Flag: PORT_SRCS
-# Port specific source files can be added here
-PORT_SRCS = $(PORT_DIR)/core_portme.c
-# Flag: LOAD
-#	Define this flag if you need to load to a target, as in a cross compile environment.
-
-# Flag: RUN
-#	Define this flag if running does not consist of simple invocation of the binary.
-#	In a cross compile environment, you need to define this.
-
-#For flashing and using a tera term macro, you could use
-#LOAD = flash ADDR 
-#RUN =  ttpmacro coremark.ttl
-
-#For copying to target and executing via SSH connection, you could use
-#LOAD = scp $(OUTFILE)  user@target:~
-#RUN = ssh user@target -c  
-
-#For native compilation and execution
-LOAD = echo Loading done
-RUN = 
-
-OEXT = .o
-EXE = .exe
-
-# Flag: SEPARATE_COMPILE
-# Define if you need to separate compilation from link stage. 
-# In this case, you also need to define below how to create an object file, and how to link.
-ifdef SEPARATE_COMPILE
-
-LD		= gcc
-OBJOUT 	= -o
-LFLAGS 	=
-OFLAG 	= -o
-COUT 	= -c
-# Flag: PORT_OBJS
-# Port specific object files can be added here
-PORT_OBJS = $(PORT_DIR)/core_portme$(OEXT)
-PORT_CLEAN = *$(OEXT)
-
-$(OPATH)%$(OEXT) : %.c
-	$(CC) $(CFLAGS) $(XCFLAGS) $(COUT) $< $(OBJOUT) $@
-	
-endif
-
-# Target: port_prebuild
-# Generate any files that are needed before actual build starts.
-# E.g. generate profile guidance files. Sample PGO generation for gcc enabled with PGO=1
-#  - First, check if PGO was defined on the command line, if so, need to add -fprofile-use to compile line.
-#  - Second, if PGO reference has not yet been generated, add a step to the prebuild that will build a profile-generate version and run it.
-#  Note - Using REBUILD=1 
-#
-# Use make PGO=1 to invoke this sample processing.
-
-ifdef PGO
- ifeq (,$(findstring $(PGO),gen))
-  PGO_STAGE=build_pgo_gcc
-  CFLAGS+=-fprofile-use
- endif
- PORT_CLEAN+=*.gcda *.gcno gmon.out
-endif
-
-.PHONY: port_prebuild
-port_prebuild: $(PGO_STAGE)
-
-.PHONY: build_pgo_gcc
-build_pgo_gcc:
-	$(MAKE) PGO=gen XCFLAGS="$(XCFLAGS) -fprofile-generate -DTOTAL_DATA_SIZE=1200" ITERATIONS=10 gen_pgo_data REBUILD=1
-	
-# Target: port_postbuild
-# Generate any files that are needed after actual build end.
-# E.g. change format to srec, bin, zip in order to be able to load into flash
-.PHONY: port_postbuild
-port_postbuild:
-
-# Target: port_postrun
-# 	Do platform specific after run stuff. 
-#	E.g. reset the board, backup the logfiles etc.
-.PHONY: port_postrun
-port_postrun:
-
-# Target: port_prerun
-# 	Do platform specific after run stuff. 
-#	E.g. reset the board, backup the logfiles etc.
-.PHONY: port_prerun
-port_prerun:
-
-# Target: port_postload
-# 	Do platform specific after load stuff. 
-#	E.g. reset the reset power to the flash eraser
-.PHONY: port_postload
-port_postload:
-
-# Target: port_preload
-# 	Do platform specific before load stuff. 
-#	E.g. reset the reset power to the flash eraser
-.PHONY: port_preload
-port_preload:
-
-# FLAG: OPATH
-# Path to the output folder. Default - current folder.
-OPATH = ./
-MKDIR = mkdir -p
-
-# FLAG: PERL
-# Define perl executable to calculate the geomean if running separate.
-PERL=/usr/bin/perl
+include posix/core_portme.mak
--- a/riscv-coremark/coremark/macos/core_portme.mak
+++ b/riscv-coremark/coremark/macos/core_portme.mak
@ -0,0 +1,18 @@
+# Copyright 2018 Embedded Microprocessor Benchmark Consortium (EEMBC)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# 
+# Original Author: Shay Gal-on
+
+NO_LIBRT = 1
+include posix/core_portme.mak
--- a/riscv-coremark/coremark/posix/core_portme.c
+++ b/riscv-coremark/coremark/posix/core_portme.c
@ -0,0 +1,419 @@
+/*
+Copyright 2018 Embedded Microprocessor Benchmark Consortium (EEMBC)
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+Original Author: Shay Gal-on
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "coremark.h"
+#if CALLGRIND_RUN
+#include <valgrind/callgrind.h>
+#endif
+
+#if (MEM_METHOD == MEM_MALLOC)
+/* Function: portable_malloc
+        Allocate <size> bytes through the C library malloc().
+        Returns the pointer malloc() produced (NULL when it fails).
+*/
+void *
+portable_malloc(size_t size)
+{
+    void *block = malloc(size);
+    return block;
+}
+/* Function: portable_free
+        Hand a block obtained from <portable_malloc> back to free().
+*/
+void
+portable_free(void *p)
+{
+    free(p);
+}
+#else
+/* Any other MEM_METHOD: no allocator is provided here, so allocation always
+   reports failure and freeing does nothing. */
+void *
+portable_malloc(size_t size)
+{
+    (void)size;
+    return NULL;
+}
+void
+portable_free(void *p)
+{
+    (void)p;
+}
+#endif
+
+/* Seed variables, defined only when SEED_METHOD == SEED_VOLATILE.
+   seed1..seed3 are defined once per enabled run type below, so enabling more
+   than one of VALIDATION_RUN / PERFORMANCE_RUN / PROFILE_RUN would define them
+   twice. seed4 carries the ITERATIONS build setting and seed5 is zero. */
+#if (SEED_METHOD == SEED_VOLATILE)
+#if VALIDATION_RUN
+volatile ee_s32 seed1_volatile = 0x3415;
+volatile ee_s32 seed2_volatile = 0x3415;
+volatile ee_s32 seed3_volatile = 0x66;
+#endif
+#if PERFORMANCE_RUN
+volatile ee_s32 seed1_volatile = 0x0;
+volatile ee_s32 seed2_volatile = 0x0;
+volatile ee_s32 seed3_volatile = 0x66;
+#endif
+#if PROFILE_RUN
+volatile ee_s32 seed1_volatile = 0x8;
+volatile ee_s32 seed2_volatile = 0x8;
+volatile ee_s32 seed3_volatile = 0x8;
+#endif
+volatile ee_s32 seed4_volatile = ITERATIONS;
+volatile ee_s32 seed5_volatile = 0;
+#endif
+/* Porting: Timing functions
+        How to capture time and convert to seconds must be ported to whatever is
+   supported by the platform. e.g. Read value from on board RTC, read value from
+   cpu clock cycles performance counter etc. Sample implementation for standard
+   time.h and windows.h definitions included.
+*/
+/* Define: TIMER_RES_DIVIDER
+        Divider to trade off timer resolution and total time that can be
+   measured.
+
+        Use lower values to increase resolution, but make sure that overflow
+   does not occur. If there are issues with the return value overflowing,
+   increase this value.
+        */
+/* Select one timer back end. Each branch defines the timestamp type
+   (CORETIMETYPE), how to sample it (GETMYTIME), how to subtract two samples
+   (MYTIMEDIFF) and the raw units per second (NSECS_PER_SEC). */
+#if USE_CLOCK
+/* clock() back end: despite its name, NSECS_PER_SEC is CLOCKS_PER_SEC here,
+   and the divider is fixed at 1 so ticks are raw clock() units. */
+#define NSECS_PER_SEC              CLOCKS_PER_SEC
+#define EE_TIMER_TICKER_RATE       1000
+#define CORETIMETYPE               clock_t
+#define GETMYTIME(_t)              (*_t = clock())
+#define MYTIMEDIFF(fin, ini)       ((fin) - (ini))
+#define TIMER_RES_DIVIDER          1
+#define SAMPLE_TIME_IMPLEMENTATION 1
+#elif defined(_MSC_VER)
+/* MSVC back end: FILETIME samples are reinterpreted as __int64 and divided by
+   TIMER_RES_DIVIDER. */
+#define NSECS_PER_SEC        10000000
+#define EE_TIMER_TICKER_RATE 1000
+#define CORETIMETYPE         FILETIME
+#define GETMYTIME(_t)        GetSystemTimeAsFileTime(_t)
+#define MYTIMEDIFF(fin, ini) \
+    (((*(__int64 *)&fin) - (*(__int64 *)&ini)) / TIMER_RES_DIVIDER)
+/* default divider with MSDEV: 10000000 / 1000 = 10000 ticks per second */
+#ifndef TIMER_RES_DIVIDER
+#define TIMER_RES_DIVIDER 1000
+#endif
+#define SAMPLE_TIME_IMPLEMENTATION 1
+#elif HAS_TIME_H
+/* clock_gettime() back end: seconds and nanoseconds are both scaled by
+   TIMER_RES_DIVIDER before being summed. The macro arguments are used with
+   '.', so they must be plain struct timespec lvalues. */
+#define NSECS_PER_SEC        1000000000
+#define EE_TIMER_TICKER_RATE 1000
+#define CORETIMETYPE         struct timespec
+#define GETMYTIME(_t)        clock_gettime(CLOCK_REALTIME, _t)
+#define MYTIMEDIFF(fin, ini)                                         \
+    ((fin.tv_sec - ini.tv_sec) * (NSECS_PER_SEC / TIMER_RES_DIVIDER) \
+     + (fin.tv_nsec - ini.tv_nsec) / TIMER_RES_DIVIDER)
+/* default divider: 1000000000 / 1000000 = 1000 ticks per second, i.e. 1/1000
+   of a second resolution */
+#ifndef TIMER_RES_DIVIDER
+#define TIMER_RES_DIVIDER 1000000
+#endif
+#define SAMPLE_TIME_IMPLEMENTATION 1
+#else
+/* No back end selected: the #error further down stops the build. */
+#define SAMPLE_TIME_IMPLEMENTATION 0
+#endif
+/* Number of ticks returned by get_time() per second of wall time. */
+#define EE_TICKS_PER_SEC (NSECS_PER_SEC / TIMER_RES_DIVIDER)
+
+#if SAMPLE_TIME_IMPLEMENTATION
+/** Define Host specific (POSIX), or target specific global time variables. */
+static CORETIMETYPE start_time_val, stop_time_val;
+
+/* Function: start_time
+        This function will be called right before starting the timed portion of
+   the benchmark.
+
+        Implementation may be capturing a system timer (as implemented in the
+   example code) or zeroing some system parameters - e.g. setting the cpu clocks
+   cycles to 0.
+*/
+void
+start_time(void)
+{
+    /* Sample the start timestamp before any optional instrumentation is
+       switched on below. */
+    GETMYTIME(&start_time_val);
+#if CALLGRIND_RUN
+    CALLGRIND_START_INSTRUMENTATION
+#endif
+#if MICA
+    /* NOTE(review): int3 is an x86 breakpoint instruction; presumably a marker
+       for an external MICA tool - confirm before building on other targets. */
+    asm volatile("int3"); /*1 */
+#endif
+}
+/* Function: stop_time
+        This function will be called right after ending the timed portion of the
+   benchmark.
+
+        Implementation may be capturing a system timer (as implemented in the
+   example code) or other system parameters - e.g. reading the current value of
+   cpu cycles counter.
+*/
+void
+stop_time(void)
+{
+    /* Mirror of start_time(): optional instrumentation is stopped first and
+       the stop timestamp is sampled last. */
+#if CALLGRIND_RUN
+    CALLGRIND_STOP_INSTRUMENTATION
+#endif
+#if MICA
+    /* NOTE(review): x86-only breakpoint instruction, presumably a MICA marker
+       - confirm. */
+    asm volatile("int3"); /*1 */
+#endif
+    GETMYTIME(&stop_time_val);
+}
+/* Function: get_time
+        Return the elapsed time between the samples taken by <start_time> and
+   <stop_time>, expressed in abstract "ticks".
+
+        The unit of a tick depends on the selected timer back end and on
+   <TIMER_RES_DIVIDER>; use <time_in_secs> to convert the value to seconds.
+   This indirection accommodates any hardware or simulated platform.
+*/
+CORE_TICKS
+get_time(void)
+{
+    return (CORE_TICKS)(MYTIMEDIFF(stop_time_val, start_time_val));
+}
+/* Function: time_in_secs
+        Convert a tick count obtained from <get_time> to seconds by dividing
+   it by EE_TICKS_PER_SEC.
+
+        The result type is <secs_ret> so that systems without floating point
+   support can be accommodated.
+*/
+secs_ret
+time_in_secs(CORE_TICKS ticks)
+{
+    const secs_ret ticks_per_sec = (secs_ret)EE_TICKS_PER_SEC;
+    return ((secs_ret)ticks) / ticks_per_sec;
+}
+#else
+#error "Please implement timing functionality in core_portme.c"
+#endif /* SAMPLE_TIME_IMPLEMENTATION */
+
+ee_u32 default_num_contexts = MULTITHREAD;
+
+/* Function: portable_init
+        Target specific initialization code.
+        Tests for some common porting mistakes (wrong ee_ptr_int / ee_u32
+   sizes, SEED_ARG without argc) and reports them with ee_printf.
+
+        p    - port state; portable_id is set to 1 on return.
+        argc - in/out argument count; decremented when a leading M<n>
+               argument is consumed.
+        argv - argument vector; shifted left by one when M<n> is consumed.
+*/
+void
+portable_init(core_portable *p, int *argc, char *argv[])
+{
+#if PRINT_ARGS
+    {
+        /* own scope: the multithread section below declares its own index */
+        int i;
+        for (i = 0; i < *argc; i++)
+        {
+            ee_printf("Arg[%d]=%s\n", i, argv[i]);
+        }
+    }
+#endif
+    if (sizeof(ee_ptr_int) != sizeof(ee_u8 *))
+    {
+        ee_printf(
+            "ERROR! Please define ee_ptr_int to a type that holds a "
+            "pointer!\n");
+    }
+    if (sizeof(ee_u32) != 4)
+    {
+        ee_printf("ERROR! Please define ee_u32 to a 32b unsigned type!\n");
+    }
+#if (MAIN_HAS_NOARGC && (SEED_METHOD == SEED_ARG))
+    ee_printf(
+        "ERROR! Main has no argc, but SEED_METHOD defined to SEED_ARG!\n");
+#endif
+
+#if (MULTITHREAD > 1) && (SEED_METHOD == SEED_ARG)
+    {
+        /* own scope so this compiles together with PRINT_ARGS, which also
+           declares an index named i */
+        int nargs = *argc;
+        int i;
+        if ((nargs > 1) && (*argv[1] == 'M'))
+        {
+            default_num_contexts = parseval(argv[1] + 1);
+            /* never run more contexts than were compiled in */
+            if (default_num_contexts > MULTITHREAD)
+                default_num_contexts = MULTITHREAD;
+            /* Shift args since first arg is directed to the portable part and
+             * not to coremark main */
+            --nargs;
+            for (i = 1; i < nargs; i++)
+                argv[i] = argv[i + 1];
+            *argc = nargs;
+        }
+    }
+#endif /* sample of potential platform specific init via command line, reset \
+          the number of contexts being used if first argument is M<n>*/
+    p->portable_id = 1;
+}
+/* Function: portable_fini
+        Target specific final code.
+        Clears p->portable_id, which <portable_init> set to 1.
+*/
+void
+portable_fini(core_portable *p)
+{
+    p->portable_id = 0;
+}
+
+#if (MULTITHREAD > 1)
+
+/* Function: core_start_parallel
+        Start benchmarking in a parallel context.
+
+        Three implementations are provided, one using pthreads, one using fork
+   and shared mem, and one using fork and sockets. Other implementations using
+   MCAPI or other standards can easily be devised.
+*/
+/* Function: core_stop_parallel
+        Stop a parallel context execution of coremark, and gather the results.
+
+        Three implementations are provided, one using pthreads, one using fork
+   and shared mem, and one using fork and sockets. Other implementations using
+   MCAPI or other standards can easily be devised.
+*/
+#if USE_PTHREAD
+/* pthread variant: run iterate(res) on a new thread stored in
+   res->port.thread. Returns the pthread_create() result narrowed to ee_u8,
+   so 0 means success here (the fork and socket variants return 1). */
+ee_u8
+core_start_parallel(core_results *res)
+{
+    int rc = pthread_create(&(res->port.thread), NULL, iterate, (void *)res);
+    return (ee_u8)rc;
+}
+/* pthread variant: wait for the thread started by core_start_parallel().
+   The thread's exit value is discarded. Returns the pthread_join() result
+   narrowed to ee_u8, so 0 means success. */
+ee_u8
+core_stop_parallel(core_results *res)
+{
+    void *thread_result;
+    int   rc = pthread_join(res->port.thread, &thread_result);
+    return (ee_u8)rc;
+}
+#elif USE_FORK
+/* Offset added to the base key so every context gets its own segment. */
+static int key_id = 0;
+/* fork + System V shared memory variant.
+   Forks a child that runs iterate(res), copies 8 bytes starting at res->crc
+   into an 8-byte shared memory segment and exits. The parent records the
+   child pid and the segment id in res->port for core_stop_parallel().
+   Returns 1 when the child was started, 0 when fork() failed. */
+ee_u8
+core_start_parallel(core_results *res)
+{
+    key_t key = 4321 + key_id;
+    key_id++;
+    res->port.pid = fork();
+    if (res->port.pid < 0)
+    {
+        /* no child exists, so there is nothing to benchmark or wait for */
+        ee_printf("ERROR in fork!\n");
+        return 0;
+    }
+    /* parent and child both look the segment up by the same key */
+    res->port.shmid = shmget(key, 8, IPC_CREAT | 0666);
+    if (res->port.shmid < 0)
+    {
+        ee_printf("ERROR in shmget!\n");
+    }
+    if (res->port.pid == 0)
+    {
+        iterate(res);
+        res->port.shm = shmat(res->port.shmid, NULL, 0);
+        /* copy the validation values to the shared memory area  and quit*/
+        if (res->port.shm == (void *)-1)
+        {
+            ee_printf("ERROR in child shmat!\n");
+        }
+        else
+        {
+            memcpy(res->port.shm, &(res->crc), 8);
+            shmdt(res->port.shm);
+        }
+        exit(0);
+    }
+    return 1;
+}
+/* fork + System V shared memory variant.
+   Waits for the child recorded in res->port.pid, copies the 8 result bytes
+   from the shared segment into res->crc and removes the segment.
+   Returns 1 on success, 0 when there is no child, waiting fails or the
+   segment cannot be attached. */
+ee_u8
+core_stop_parallel(core_results *res)
+{
+    int   status;
+    pid_t wpid;
+
+    if (res->port.pid < 0)
+    {
+        /* fork() failed earlier; waitpid(-1, ...) would reap an unrelated
+           child instead of reporting an error */
+        ee_printf("ERROR waiting for child.\n");
+        return 0;
+    }
+    wpid = waitpid(res->port.pid, &status, WUNTRACED);
+    if (wpid != res->port.pid)
+    {
+        ee_printf("ERROR waiting for child.\n");
+        if (errno == ECHILD)
+            ee_printf("errno=No such child %d\n", res->port.pid);
+        if (errno == EINTR)
+            ee_printf("errno=Interrupted\n");
+        return 0;
+    }
+    /* after process is done, get the values from the shared memory area */
+    res->port.shm = shmat(res->port.shmid, NULL, 0);
+    if (res->port.shm == (void *)-1)
+    {
+        ee_printf("ERROR in parent shmat!\n");
+        /* best effort: do not leave the segment behind on the error path */
+        shmctl(res->port.shmid, IPC_RMID, NULL);
+        return 0;
+    }
+    memcpy(&(res->crc), res->port.shm, 8);
+    shmdt(res->port.shm);
+    /* System V segments outlive their creator unless explicitly removed */
+    if (shmctl(res->port.shmid, IPC_RMID, NULL) < 0)
+        ee_printf("ERROR in shmctl!\n");
+    return 1;
+}
+#elif USE_SOCKET
+/* Offset added to the base UDP port so every context gets its own port. */
+static int key_id = 0;
+/* fork + UDP socket variant.
+   Forks a child that runs iterate(res) and sends 8 bytes starting at res->crc
+   to 127.0.0.1 on port 7654 + n. The parent opens a UDP socket bound to
+   that address so core_stop_parallel() can receive the datagram.
+   Returns 1 when the child was started and the parent socket was created,
+   0 otherwise (res->port.sock is then -1). */
+ee_u8
+core_start_parallel(core_results *res)
+{
+    int bound, buffer_length = 8;
+    res->port.sa.sin_family      = AF_INET;
+    res->port.sa.sin_addr.s_addr = htonl(0x7F000001);
+    res->port.sa.sin_port        = htons(7654 + key_id);
+    key_id++;
+    res->port.pid = fork();
+    if (res->port.pid < 0)
+    {
+        /* no child will ever send a result; mark the socket as unusable */
+        ee_printf("fork(): %s\n", strerror(errno));
+        res->port.sock = -1;
+        return 0;
+    }
+    if (res->port.pid == 0)
+    { /* benchmark child */
+        iterate(res);
+        res->port.sock = socket(PF_INET, SOCK_DGRAM, IPPROTO_UDP);
+        if (-1 == res->port.sock) /* if socket failed to initialize, exit */
+        {
+            ee_printf("Error Creating Socket");
+        }
+        else
+        {
+            int bytes_sent = sendto(res->port.sock,
+                                    &(res->crc),
+                                    buffer_length,
+                                    0,
+                                    (struct sockaddr *)&(res->port.sa),
+                                    sizeof(struct sockaddr_in));
+            if (bytes_sent < 0)
+                ee_printf("Error sending packet: %s\n", strerror(errno));
+            close(res->port.sock); /* close the socket */
+        }
+        exit(0);
+    }
+    /* parent process, open the socket */
+    res->port.sock = socket(PF_INET, SOCK_DGRAM, IPPROTO_UDP);
+    if (-1 == res->port.sock)
+    {
+        /* binding an invalid descriptor would only hide the real error */
+        ee_printf("socket(): %s\n", strerror(errno));
+        return 0;
+    }
+    bound = bind(res->port.sock,
+                 (struct sockaddr *)&(res->port.sa),
+                 sizeof(struct sockaddr));
+    if (bound < 0)
+        ee_printf("bind(): %s\n", strerror(errno));
+    return 1;
+}
+/* fork + UDP socket variant.
+   Receives the 8 result bytes sent by the child into res->crc, closes the
+   parent's socket and reaps the child.
+   Returns 1 on success, 0 when there is no child, the receive fails or
+   waiting for the child fails. */
+ee_u8
+core_stop_parallel(core_results *res)
+{
+    int       status;
+    int       recsize;
+    pid_t     wpid;
+    ee_u8     ok      = 1;
+    socklen_t fromlen = sizeof(struct sockaddr); /* recvfrom wants socklen_t */
+
+    if (res->port.pid < 0)
+    {
+        /* fork() failed earlier: nobody will send, so recvfrom() would block
+           forever and waitpid(-1, ...) would reap an unrelated child */
+        if (res->port.sock >= 0)
+            close(res->port.sock);
+        ee_printf("ERROR waiting for child.\n");
+        return 0;
+    }
+    recsize = recvfrom(res->port.sock,
+                       &(res->crc),
+                       8,
+                       0,
+                       (struct sockaddr *)&(res->port.sa),
+                       &fromlen);
+    if (recsize < 0)
+    {
+        ee_printf("Error in receive: %s\n", strerror(errno));
+        ok = 0;
+    }
+    /* the descriptor is not used again; release it on every path */
+    close(res->port.sock);
+    /* reap the child even when the receive failed, to avoid a zombie */
+    wpid = waitpid(res->port.pid, &status, WUNTRACED);
+    if (wpid != res->port.pid)
+    {
+        ee_printf("ERROR waiting for child.\n");
+        if (errno == ECHILD)
+            ee_printf("errno=No such child %d\n", res->port.pid);
+        if (errno == EINTR)
+            ee_printf("errno=Interrupted\n");
+        return 0;
+    }
+    return ok;
+}
+#else /* no standard multicore implementation */
+#error \
+    "Please implement multicore functionality in core_portme.c to use multiple contexts."
+#endif /* multithread implementations */
+#endif
--- a/riscv-coremark/coremark/posix/core_portme.h
+++ b/riscv-coremark/coremark/posix/core_portme.h
@ -0,0 +1,314 @@
+/*
+Copyright 2018 Embedded Microprocessor Benchmark Consortium (EEMBC)
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+Original Author: Shay Gal-on
+*/
+
+/* Topic: Description
+        This file contains configuration constants required to execute on
+   different platforms
+*/
+#ifndef CORE_PORTME_H
+#define CORE_PORTME_H
+
+#include "core_portme_posix_overrides.h"
+
+/************************/
+/* Data types and settings */
+/************************/
+/* Configuration: HAS_FLOAT
+        Define to 1 if the platform supports floating point.
+*/
+#ifndef HAS_FLOAT
+#define HAS_FLOAT 1
+#endif
+/* Configuration: HAS_TIME_H
+        Define to 1 if platform has the time.h header file,
+        and implementation of functions thereof.
+*/
+#ifndef HAS_TIME_H
+#define HAS_TIME_H 1
+#endif
+/* Configuration: USE_CLOCK
+        Define to 1 to time the benchmark with clock() from time.h
+        instead of the platform default timer in core_portme.c.
+*/
+#ifndef USE_CLOCK
+#define USE_CLOCK 0
+#endif
+/* Configuration: HAS_STDIO
+        Define to 1 if the platform has stdio.h.
+*/
+#ifndef HAS_STDIO
+#define HAS_STDIO 1
+#endif
+/* Configuration: HAS_PRINTF
+        Define to 1 if the platform has stdio.h and implements the printf
+   function.
+*/
+#ifndef HAS_PRINTF
+#define HAS_PRINTF 1
+#endif
+
+/* Configuration: CORE_TICKS
+        Define type of return from the timing functions.
+ */
+#if defined(_MSC_VER)
+#include <windows.h>
+typedef size_t CORE_TICKS;
+#elif HAS_TIME_H
+#include <time.h>
+typedef clock_t CORE_TICKS;
+#else
+#error \
+    "Please define type of CORE_TICKS and implement start_time, end_time get_time and time_in_secs functions!"
+#endif
+
+/* Definitions: COMPILER_VERSION, COMPILER_FLAGS, MEM_LOCATION
+        Initialize these strings per platform
+*/
+#ifndef COMPILER_VERSION
+#ifdef __GNUC__
+#define COMPILER_VERSION "GCC"__VERSION__
+#else
+#define COMPILER_VERSION "Please put compiler version here (e.g. gcc 4.1)"
+#endif
+#endif
+#ifndef COMPILER_FLAGS
+#define COMPILER_FLAGS \
+    FLAGS_STR /* "Please put compiler flags here (e.g. -o3)" */
+#endif
+#ifndef MEM_LOCATION
+#define MEM_LOCATION                                                         \
+    "Please put data memory location here\n\t\t\t(e.g. code in flash, data " \
+    "on heap etc)"
+#define MEM_LOCATION_UNSPEC 1
+#endif
+
+#include <stdint.h>
+
+/* Data Types:
+        To avoid compiler issues, define the data types that need to be used for
+   8b, 16b and 32b in <core_portme.h>.
+
+        *Important*:
+        ee_ptr_int needs to be the data type used to hold pointers, otherwise
+   coremark may fail!!!
+
+        Note that ee_f32 is a double in this port, despite its name.
+*/
+typedef signed short   ee_s16;
+typedef unsigned short ee_u16;
+typedef signed int     ee_s32;
+typedef double         ee_f32;
+typedef unsigned char  ee_u8;
+typedef unsigned int   ee_u32;
+typedef uintptr_t      ee_ptr_int;
+typedef size_t         ee_size_t;
+/* align an offset to point to a 32b value: rounds x up to the next multiple
+   of 4 and leaves already aligned values unchanged */
+#define align_mem(x) (void *)(4 + (((ee_ptr_int)(x)-1) & ~3))
+
+/* Configuration: SEED_METHOD
+        Defines method to get seed values that cannot be computed at compile
+   time.
+
+        Valid values:
+        SEED_ARG - from command line.
+        SEED_FUNC - from a system function.
+        SEED_VOLATILE - from volatile variables.
+*/
+#ifndef SEED_METHOD
+#define SEED_METHOD SEED_ARG
+#endif
+
+/* Configuration: MEM_METHOD
+        Defines method to get a block of memory.
+
+        Valid values:
+        MEM_MALLOC - for platforms that implement malloc and have malloc.h.
+        MEM_STATIC - to use a static memory array.
+        MEM_STACK - to allocate the data block on the stack (NYI).
+*/
+#ifndef MEM_METHOD
+#define MEM_METHOD MEM_MALLOC
+#endif
+
+/* Configuration: MULTITHREAD
+        Define for parallel execution
+
+        Valid values:
+        1 - only one context (default).
+        N>1 - will execute N copies in parallel.
+
+        Note:
+        If this flag is defined to more than 1, an implementation for launching
+   parallel contexts must be defined.
+
+        Three sample implementations are provided. Use <USE_PTHREAD>,
+   <USE_FORK> or <USE_SOCKET> to enable them.
+
+        It is valid to have a different implementation of <core_start_parallel>
+   and <core_stop_parallel> in <core_portme.c>, to fit a particular architecture.
+*/
+#ifndef MULTITHREAD
+#define MULTITHREAD 1
+#endif
+
+/* Configuration: USE_PTHREAD
+        Sample implementation for launching parallel contexts
+        This implementation uses pthread_thread_create and pthread_join.
+
+        Valid values:
+        0 - Do not use pthreads API.
+        1 - Use pthreads API
+
+        Note:
+        This flag only matters if MULTITHREAD has been defined to a value
+   greater than 1.
+*/
+#ifndef USE_PTHREAD
+#define USE_PTHREAD 0
+#endif
+
+/* Configuration: USE_FORK
+        Sample implementation for launching parallel contexts
+        This implementation uses fork, waitpid, shmget,shmat and shmdt.
+
+        Valid values:
+        0 - Do not use fork API.
+        1 - Use fork API
+
+        Note:
+        This flag only matters if MULTITHREAD has been defined to a value
+   greater than 1.
+*/
+#ifndef USE_FORK
+#define USE_FORK 0
+#endif
+
+/* Configuration: USE_SOCKET
+        Sample implementation for launching parallel contexts
+        This implementation uses fork, socket, sendto and recvfrom
+
+        Valid values:
+        0 - Do not use fork and sockets API.
+        1 - Use fork and sockets API
+
+        Note:
+        This flag only matters if MULTITHREAD has been defined to a value
+   greater than 1.
+*/
+#ifndef USE_SOCKET
+#define USE_SOCKET 0
+#endif
+
+/* Configuration: MAIN_HAS_NOARGC
+        Needed if platform does not support getting arguments to main.
+
+        Valid values:
+        0 - argc/argv to main is supported
+        1 - argc/argv to main is not supported
+*/
+#ifndef MAIN_HAS_NOARGC
+#define MAIN_HAS_NOARGC 0
+#endif
+
+/* Configuration: MAIN_HAS_NORETURN
+        Needed if platform does not support returning a value from main.
+
+        Valid values:
+        0 - main returns an int, and return value will be 0.
+        1 - platform does not support returning a value from main
+*/
+#ifndef MAIN_HAS_NORETURN
+#define MAIN_HAS_NORETURN 0
+#endif
+
+/* Variable: default_num_contexts
+        Number of contexts to spawn in multicore context.
+        Override this global value to change number of contexts used.
+
+        Note:
+        This value may not be set higher than the <MULTITHREAD> define.
+
+        To experiment, you can set the <MULTITHREAD> define to the highest value
+   expected, and use argc/argv in the <portable_init> to set this value from the
+   command line.
+*/
+extern ee_u32 default_num_contexts;
+
+#if (MULTITHREAD > 1)
+#if USE_PTHREAD
+#include <pthread.h>
+#define PARALLEL_METHOD "PThreads"
+#elif USE_FORK
+#include <unistd.h>
+#include <errno.h>
+#include <sys/wait.h>
+#include <sys/shm.h>
+#include <string.h> /* for memcpy */
+#define PARALLEL_METHOD "Fork"
+#elif USE_SOCKET
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#include <sys/wait.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <errno.h>
+#define PARALLEL_METHOD "Sockets"
+#else
+#define PARALLEL_METHOD "Proprietary"
+#error \
+    "Please implement multicore functionality in core_portme.c to use multiple contexts."
+#endif /* Method for multithreading */
+#endif /* MULTITHREAD > 1 */
+
+/* Per-context port state. The multithread members exist only when
+   MULTITHREAD > 1 and depend on the selected parallel method. */
+typedef struct CORE_PORTABLE_S
+{
+#if (MULTITHREAD > 1)
+#if USE_PTHREAD
+    /* thread handle filled in by pthread_create() */
+    pthread_t thread;
+#elif USE_FORK
+    /* child process id returned by fork() */
+    pid_t pid;
+    /* shared memory segment id returned by shmget() */
+    int   shmid;
+    /* address returned by shmat() while the segment is attached */
+    void *shm;
+#elif USE_SOCKET
+    /* child process id returned by fork() */
+    pid_t              pid;
+    /* UDP socket descriptor */
+    int                sock;
+    /* address used both to send and to bind */
+    struct sockaddr_in sa;
+#endif /* Method for multithreading */
+#endif /* MULTITHREAD>1 */
+    /* set to 1 by portable_init() and to 0 by portable_fini() */
+    ee_u8 portable_id;
+} core_portable;
+
+/* target specific init/fini */
+void portable_init(core_portable *p, int *argc, char *argv[]);
+void portable_fini(core_portable *p);
+
+/* With SEED_VOLATILE a run type must be known at compile time. If none of
+   VALIDATION_RUN / PERFORMANCE_RUN / PROFILE_RUN was requested, choose
+   PROFILE_RUN when TOTAL_DATA_SIZE is 1200 and PERFORMANCE_RUN otherwise. */
+#if (SEED_METHOD == SEED_VOLATILE)
+#if (VALIDATION_RUN || PERFORMANCE_RUN || PROFILE_RUN)
+#define RUN_TYPE_FLAG 1
+#else
+#if (TOTAL_DATA_SIZE == 1200)
+#define PROFILE_RUN 1
+#else
+#define PERFORMANCE_RUN 1
+#endif
+#endif
+#endif /* SEED_METHOD==SEED_VOLATILE */
+
+#endif /* CORE_PORTME_H */
--- a/riscv-coremark/coremark/posix/core_portme.mak
+++ b/riscv-coremark/coremark/posix/core_portme.mak
@ -0,0 +1,151 @@
+# Copyright 2018 Embedded Microprocessor Benchmark Consortium (EEMBC)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# 
+# Original Author: Shay Gal-on
+
+#File: core_portme.mak
+
+# Flag: OUTFLAG
+#	Use this flag to define how to get an executable (e.g. -o)
+OUTFLAG= -o
+# Flag: CC
+#	Use this flag to define compiler to use
+CC?= cc
+# Flag: CFLAGS
+#	Use this flag to define compiler options. Note, you can add compiler options from the command line using XCFLAGS="other flags"
+PORT_CFLAGS = -O2
+FLAGS_STR = "$(PORT_CFLAGS) $(XCFLAGS) $(XLFLAGS) $(LFLAGS_END)"
+CFLAGS = $(PORT_CFLAGS) -I$(PORT_DIR) -Iposix -I. -DFLAGS_STR=\"$(FLAGS_STR)\"
+# Flag: NO_LIBRT
+#	Define if the platform does not provide a librt
+ifndef NO_LIBRT
+#Flag: LFLAGS_END
+#	Define any libraries needed for linking or other flags that should come at the end of the link line (e.g. linker scripts). 
+#	Note: On certain platforms, the default clock_gettime implementation is supported but requires linking of librt.
+LFLAGS_END += -lrt
+endif
+# Flag: PORT_SRCS
+#	Port specific source files can be added here
+PORT_SRCS = posix/core_portme.c
+vpath %.c posix
+vpath %.h posix
+vpath %.mak posix
+# Flag: EXTRA_DEPENDS
+#	Port specific extra build dependencies.
+#	Some ports inherit from us, so ensure this Makefile is always a dependency.
+EXTRA_DEPENDS += posix/core_portme.mak
+# Flag: LOAD
+#	Define this flag if you need to load to a target, as in a cross compile environment.
+
+# Flag: RUN
+#	Define this flag if running does not consist of simple invocation of the binary.
+#	In a cross compile environment, you need to define this.
+
+#For flashing and using a tera term macro, you could use
+#LOAD = flash ADDR 
+#RUN =  ttpmacro coremark.ttl
+
+#For copying to target and executing via SSH connection, you could use
+#LOAD = scp $(OUTFILE)  user@target:~
+#RUN = ssh user@target -c  
+
+#For native compilation and execution
+LOAD = echo Loading done
+RUN = 
+
+OEXT = .o
+EXE = .exe
+
+# Flag: SEPARATE_COMPILE
+# Define if you need to separate compilation from link stage. 
+# In this case, you also need to define below how to create an object file, and how to link.
+ifdef SEPARATE_COMPILE
+
+LD		= gcc
+OBJOUT 	= -o
+LFLAGS 	=
+OFLAG 	= -o
+COUT 	= -c
+# Flag: PORT_OBJS
+# Port specific object files can be added here
+PORT_OBJS = $(PORT_DIR)/core_portme$(OEXT)
+PORT_CLEAN = *$(OEXT)
+
+$(OPATH)%$(OEXT) : %.c
+	$(CC) $(CFLAGS) $(XCFLAGS) $(COUT) $< $(OBJOUT) $@
+	
+endif
+
+# Target: port_prebuild
+# Generate any files that are needed before actual build starts.
+# E.g. generate profile guidance files. Sample PGO generation for gcc enabled with PGO=1
+#  - First, check if PGO was defined on the command line, if so, need to add -fprofile-use to compile line.
+#  - Second, if PGO reference has not yet been generated, add a step to the prebuild that will build a profile-generate version and run it.
+#  Note - Using REBUILD=1 
+#
+# Use make PGO=1 to invoke this sample processing.
+
+ifdef PGO
+ ifeq (,$(findstring $(PGO),gen))
+  PGO_STAGE=build_pgo_gcc
+  CFLAGS+=-fprofile-use
+ endif
+ PORT_CLEAN+=*.gcda *.gcno gmon.out
+endif
+
+.PHONY: port_prebuild
+port_prebuild: $(PGO_STAGE)
+
+.PHONY: build_pgo_gcc
+build_pgo_gcc:
+	$(MAKE) PGO=gen XCFLAGS="$(XCFLAGS) -fprofile-generate -DTOTAL_DATA_SIZE=1200" ITERATIONS=10 gen_pgo_data REBUILD=1
+	
+# Target: port_postbuild
+# Generate any files that are needed after actual build end.
+# E.g. change format to srec, bin, zip in order to be able to load into flash
+.PHONY: port_postbuild
+port_postbuild:
+
+# Target: port_postrun
+# 	Do platform specific after run stuff. 
+#	E.g. reset the board, backup the logfiles etc.
+.PHONY: port_postrun
+port_postrun:
+
+# Target: port_prerun
+# 	Do platform specific before run stuff. 
+#	E.g. reset the board, backup the logfiles etc.
+.PHONY: port_prerun
+port_prerun:
+
+# Target: port_postload
+# 	Do platform specific after load stuff. 
+#	E.g. reset the reset power to the flash eraser
+.PHONY: port_postload
+port_postload:
+
+# Target: port_preload
+# 	Do platform specific before load stuff. 
+#	E.g. reset the reset power to the flash eraser
+.PHONY: port_preload
+port_preload:
+
+# FLAG: OPATH
+# Path to the output folder. Default - current folder.
+OPATH = ./
+MKDIR = mkdir -p
+
+# FLAG: PERL
+# Define perl executable to calculate the geomean if running separate.
+PERL=/usr/bin/perl
--- a/riscv-coremark/coremark/posix/core_portme_posix_overrides.h
+++ b/riscv-coremark/coremark/posix/core_portme_posix_overrides.h
@ -0,0 +1,28 @@
+/*
+Copyright 2018 Embedded Microprocessor Benchmark Consortium (EEMBC)
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+Original Author: Shay Gal-on
+*/
+
+/* Topic: Description
+        This file contains additional configuration constants required to execute on
+   different platforms over and above the POSIX defaults
+*/
+#ifndef CORE_PORTME_POSIX_OVERRIDES_H
+#define CORE_PORTME_POSIX_OVERRIDES_H
+
+/* None by default */
+
+#endif
--- a/riscv-coremark/coremark/rtems/core_portme.mak
+++ b/riscv-coremark/coremark/rtems/core_portme.mak
@ -0,0 +1,18 @@
+# Copyright 2018 Embedded Microprocessor Benchmark Consortium (EEMBC)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# 
+# Original Author: Shay Gal-on
+
+NO_LIBRT = 1
+include posix/core_portme.mak
--- a/riscv-coremark/coremark/rtems/init.c
+++ b/riscv-coremark/coremark/rtems/init.c
@ -0,0 +1,63 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2021 Hesham Almatary
+ *
+ * This software was developed by SRI International and the University of
+ * Cambridge Computer Laboratory (Department of Computer Science and
+ * Technology) under DARPA contract HR0011-18-C-0016 ("ECATS"), as part of the
+ * DARPA SSITH research programme.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <stdlib.h>
+#include <bsp.h>
+
+/* Benchmark entry point; it is not defined in this file.
+ * NOTE(review): the second parameter is declared as void ** here, while the
+ * POSIX port's portable_init() takes char *argv[] - confirm this prototype
+ * matches the actual definition of main. */
+int main(
+  int argc,
+  void **args
+);
+
+rtems_task Init(
+  rtems_task_argument ignored
+);
+
+/* Init task: calls main() with an empty argument list (argc 0, NULL vector)
+ * and passes its return value to exit(). The task argument is unused. */
+rtems_task Init(
+  rtems_task_argument ignored
+)
+{
+  int ret = main(0, NULL);
+  exit(ret);
+}
+
+/* configuration information */
+#define CONFIGURE_APPLICATION_NEEDS_SIMPLE_CONSOLE_DRIVER
+#define CONFIGURE_APPLICATION_NEEDS_CLOCK_DRIVER
+
+#define CONFIGURE_MAXIMUM_TASKS 20
+
+#define CONFIGURE_RTEMS_INIT_TASKS_TABLE
+
+#define CONFIGURE_INIT
+
+#include <rtems/confdefs.h>
--- a/riscv-coremark/coremark/simple/core_portme.c
+++ b/riscv-coremark/coremark/simple/core_portme.c
@ -21,108 +21,129 @@ Original Author: Shay Gal-on
 #include "coremark.h"

 #if VALIDATION_RUN
-	volatile ee_s32 seed1_volatile=0x3415;
-	volatile ee_s32 seed2_volatile=0x3415;
-	volatile ee_s32 seed3_volatile=0x66;
+volatile ee_s32 seed1_volatile = 0x3415;
+volatile ee_s32 seed2_volatile = 0x3415;
+volatile ee_s32 seed3_volatile = 0x66;
 #endif
 #if PERFORMANCE_RUN
-	volatile ee_s32 seed1_volatile=0x0;
-	volatile ee_s32 seed2_volatile=0x0;
-	volatile ee_s32 seed3_volatile=0x66;
+volatile ee_s32 seed1_volatile = 0x0;
+volatile ee_s32 seed2_volatile = 0x0;
+volatile ee_s32 seed3_volatile = 0x66;
 #endif
 #if PROFILE_RUN
-	volatile ee_s32 seed1_volatile=0x8;
-	volatile ee_s32 seed2_volatile=0x8;
-	volatile ee_s32 seed3_volatile=0x8;
+volatile ee_s32 seed1_volatile = 0x8;
+volatile ee_s32 seed2_volatile = 0x8;
+volatile ee_s32 seed3_volatile = 0x8;
 #endif
-	volatile ee_s32 seed4_volatile=ITERATIONS;
-	volatile ee_s32 seed5_volatile=0;
+volatile ee_s32 seed4_volatile = ITERATIONS;
+volatile ee_s32 seed5_volatile = 0;
 /* Porting : Timing functions
-	How to capture time and convert to seconds must be ported to whatever is supported by the platform.
-	e.g. Read value from on board RTC, read value from cpu clock cycles performance counter etc. 
-	Sample implementation for standard time.h and windows.h definitions included.
+        How to capture time and convert to seconds must be ported to whatever is
+   supported by the platform. e.g. Read value from on board RTC, read value from
+   cpu clock cycles performance counter etc. Sample implementation for standard
+   time.h and windows.h definitions included.
 */
 /* Define : TIMER_RES_DIVIDER
-	Divider to trade off timer resolution and total time that can be measured.
+        Divider to trade off timer resolution and total time that can be
+   measured.

-	Use lower values to increase resolution, but make sure that overflow does not occur.
-	If there are issues with the return value overflowing, increase this value.
-	*/
-#define NSECS_PER_SEC CLOCKS_PER_SEC
-#define CORETIMETYPE clock_t 
-#define GETMYTIME(_t) (*_t=clock())
-#define MYTIMEDIFF(fin,ini) ((fin)-(ini))
-#define TIMER_RES_DIVIDER 1
+        Use lower values to increase resolution, but make sure that overflow
+   does not occur. If there are issues with the return value overflowing,
+   increase this value.
+        */
+#define NSECS_PER_SEC              CLOCKS_PER_SEC
+#define CORETIMETYPE               clock_t
+#define GETMYTIME(_t)              (*_t = clock())
+#define MYTIMEDIFF(fin, ini)       ((fin) - (ini))
+#define TIMER_RES_DIVIDER          1
 #define SAMPLE_TIME_IMPLEMENTATION 1
-#define EE_TICKS_PER_SEC (NSECS_PER_SEC / TIMER_RES_DIVIDER)
+#define EE_TICKS_PER_SEC           (NSECS_PER_SEC / TIMER_RES_DIVIDER)

 /** Define Host specific (POSIX), or target specific global time variables. */
 static CORETIMETYPE start_time_val, stop_time_val;

 /* Function : start_time
-	This function will be called right before starting the timed portion of the benchmark.
+        This function will be called right before starting the timed portion of
+   the benchmark.

-	Implementation may be capturing a system timer (as implemented in the example code) 
-	or zeroing some system parameters - e.g. setting the cpu clocks cycles to 0.
+        Implementation may be capturing a system timer (as implemented in the
+   example code) or zeroing some system parameters - e.g. setting the cpu clocks
+   cycles to 0.
 */
-void start_time(void) {
-	GETMYTIME(&start_time_val );      
+void
+start_time(void)
+{
+    GETMYTIME(&start_time_val);
 }
 /* Function : stop_time
-	This function will be called right after ending the timed portion of the benchmark.
+        This function will be called right after ending the timed portion of the
+   benchmark.

-	Implementation may be capturing a system timer (as implemented in the example code) 
-	or other system parameters - e.g. reading the current value of cpu cycles counter.
+        Implementation may be capturing a system timer (as implemented in the
+   example code) or other system parameters - e.g. reading the current value of
+   cpu cycles counter.
 */
-void stop_time(void) {
-	GETMYTIME(&stop_time_val );      
+void
+stop_time(void)
+{
+    GETMYTIME(&stop_time_val);
 }
 /* Function : get_time
-	Return an abstract "ticks" number that signifies time on the system.
-	
-	Actual value returned may be cpu cycles, milliseconds or any other value,
-	as long as it can be converted to seconds by <time_in_secs>.
-	This methodology is taken to accomodate any hardware or simulated platform.
-	The sample implementation returns millisecs by default, 
-	and the resolution is controlled by <TIMER_RES_DIVIDER>
+        Return an abstract "ticks" number that signifies time on the system.
+
+        Actual value returned may be cpu cycles, milliseconds or any other
+   value, as long as it can be converted to seconds by <time_in_secs>. This
+   methodology is taken to accommodate any hardware or simulated platform. The
+   sample implementation returns millisecs by default, and the resolution is
+   controlled by <TIMER_RES_DIVIDER>
 */
-CORE_TICKS get_time(void) {
-	CORE_TICKS elapsed=(CORE_TICKS)(MYTIMEDIFF(stop_time_val, start_time_val));
-	return elapsed;
+CORE_TICKS
+get_time(void)
+{
+    CORE_TICKS elapsed
+        = (CORE_TICKS)(MYTIMEDIFF(stop_time_val, start_time_val));
+    return elapsed;
 }
 /* Function : time_in_secs
-	Convert the value returned by get_time to seconds.
+        Convert the value returned by get_time to seconds.

-	The <secs_ret> type is used to accomodate systems with no support for floating point.
-	Default implementation implemented by the EE_TICKS_PER_SEC macro above.
+        The <secs_ret> type is used to accommodate systems with no support for
+   floating point. Default implementation implemented by the EE_TICKS_PER_SEC
+   macro above.
 */
-secs_ret time_in_secs(CORE_TICKS ticks) {
-	secs_ret retval=((secs_ret)ticks) / (secs_ret)EE_TICKS_PER_SEC;
-	return retval;
+secs_ret
+time_in_secs(CORE_TICKS ticks)
+{
+    secs_ret retval = ((secs_ret)ticks) / (secs_ret)EE_TICKS_PER_SEC;
+    return retval;
 }

-ee_u32 default_num_contexts=1;
+ee_u32 default_num_contexts = 1;

 /* Function : portable_init
-	Target specific initialization code 
-	Test for some common mistakes.
+        Target specific initialization code
+        Test for some common mistakes.
 */
-void portable_init(core_portable *p, int *argc, char *argv[])
+void
+portable_init(core_portable *p, int *argc, char *argv[])
 {
-	if (sizeof(ee_ptr_int) != sizeof(ee_u8 *)) {
-		ee_printf("ERROR! Please define ee_ptr_int to a type that holds a pointer!\n");
-	}
-	if (sizeof(ee_u32) != 4) {
-		ee_printf("ERROR! Please define ee_u32 to a 32b unsigned type!\n");
-	}
-	p->portable_id=1;
+    if (sizeof(ee_ptr_int) != sizeof(ee_u8 *))
+    {
+        ee_printf(
+            "ERROR! Please define ee_ptr_int to a type that holds a "
+            "pointer!\n");
+    }
+    if (sizeof(ee_u32) != 4)
+    {
+        ee_printf("ERROR! Please define ee_u32 to a 32b unsigned type!\n");
+    }
+    p->portable_id = 1;
 }
 /* Function : portable_fini
-	Target specific final code 
+        Target specific final code
 */
-void portable_fini(core_portable *p)
+void
+portable_fini(core_portable *p)
 {
-	p->portable_id=0;
+    p->portable_id = 0;
 }
-
-
--- a/riscv-coremark/coremark/simple/core_portme.h
+++ b/riscv-coremark/coremark/simple/core_portme.h
@ -17,176 +17,188 @@ Original Author: Shay Gal-on
 */

 /* Topic : Description
-	This file contains configuration constants required to execute on different platforms
+        This file contains configuration constants required to execute on
+   different platforms
 */
 #ifndef CORE_PORTME_H
 #define CORE_PORTME_H
 /************************/
 /* Data types and settings */
 /************************/
-/* Configuration : HAS_FLOAT 
-	Define to 1 if the platform supports floating point.
+/* Configuration : HAS_FLOAT
+        Define to 1 if the platform supports floating point.
 */
-#ifndef HAS_FLOAT 
+#ifndef HAS_FLOAT
 #define HAS_FLOAT 1
 #endif
 /* Configuration : HAS_TIME_H
-	Define to 1 if platform has the time.h header file,
-	and implementation of functions thereof.
+        Define to 1 if platform has the time.h header file,
+        and implementation of functions thereof.
 */
 #ifndef HAS_TIME_H
 #define HAS_TIME_H 1
 #endif
 /* Configuration : USE_CLOCK
-	Define to 1 if platform has the time.h header file,
-	and implementation of functions thereof.
+        Define to 1 if platform has the time.h header file,
+        and implementation of functions thereof.
 */
 #ifndef USE_CLOCK
 #define USE_CLOCK 1
 #endif
 /* Configuration : HAS_STDIO
-	Define to 1 if the platform has stdio.h.
+        Define to 1 if the platform has stdio.h.
 */
 #ifndef HAS_STDIO
 #define HAS_STDIO 1
 #endif
 /* Configuration : HAS_PRINTF
-	Define to 1 if the platform has stdio.h and implements the printf function.
+        Define to 1 if the platform has stdio.h and implements the printf
+   function.
 */
 #ifndef HAS_PRINTF
 #define HAS_PRINTF 1
 #endif

 /* Configuration : CORE_TICKS
-	Define type of return from the timing functions.
+        Define type of return from the timing functions.
 */
 #include <time.h>
 typedef clock_t CORE_TICKS;

 /* Definitions : COMPILER_VERSION, COMPILER_FLAGS, MEM_LOCATION
-	Initialize these strings per platform
+        Initialize these strings per platform
 */
-#ifndef COMPILER_VERSION 
- #ifdef __GNUC__
- #define COMPILER_VERSION "GCC"__VERSION__
- #else
- #define COMPILER_VERSION "Please put compiler version here (e.g. gcc 4.1)"
- #endif
+#ifndef COMPILER_VERSION
+#ifdef __GNUC__
+#define COMPILER_VERSION "GCC"__VERSION__
+#else
+#define COMPILER_VERSION "Please put compiler version here (e.g. gcc 4.1)"
 #endif
-#ifndef COMPILER_FLAGS 
- #define COMPILER_FLAGS FLAGS_STR /* "Please put compiler flags here (e.g. -o3)" */
 #endif
-#ifndef MEM_LOCATION 
- #define MEM_LOCATION "STACK"
+#ifndef COMPILER_FLAGS
+#define COMPILER_FLAGS \
+    FLAGS_STR /* "Please put compiler flags here (e.g. -o3)" */
+#endif
+#ifndef MEM_LOCATION
+#define MEM_LOCATION "STACK"
 #endif

 /* Data Types :
-	To avoid compiler issues, define the data types that need ot be used for 8b, 16b and 32b in <core_portme.h>.
-	
-	*Imprtant* :
-	ee_ptr_int needs to be the data type used to hold pointers, otherwise coremark may fail!!!
+        To avoid compiler issues, define the data types that need to be used for
+   8b, 16b and 32b in <core_portme.h>.
+
+        *Important* :
+        ee_ptr_int needs to be the data type used to hold pointers, otherwise
+   coremark may fail!!!
 */
-typedef signed short ee_s16;
+typedef signed short   ee_s16;
 typedef unsigned short ee_u16;
-typedef signed int ee_s32;
-typedef double ee_f32;
-typedef unsigned char ee_u8;
-typedef unsigned int ee_u32;
-typedef ee_u32 ee_ptr_int;
-typedef size_t ee_size_t;
+typedef signed int     ee_s32;
+typedef double         ee_f32;
+typedef unsigned char  ee_u8;
+typedef unsigned int   ee_u32;
+typedef ee_u32         ee_ptr_int;
+typedef size_t         ee_size_t;
 /* align_mem :
-	This macro is used to align an offset to point to a 32b value. It is used in the Matrix algorithm to initialize the input memory blocks.
+        This macro is used to align an offset to point to a 32b value. It is
+   used in the Matrix algorithm to initialize the input memory blocks.
 */
-#define align_mem(x) (void *)(4 + (((ee_ptr_int)(x) - 1) & ~3))
+#define align_mem(x) (void *)(4 + (((ee_ptr_int)(x)-1) & ~3))

 /* Configuration : SEED_METHOD
-	Defines method to get seed values that cannot be computed at compile time.
-	
-	Valid values :
-	SEED_ARG - from command line.
-	SEED_FUNC - from a system function.
-	SEED_VOLATILE - from volatile variables.
+        Defines method to get seed values that cannot be computed at compile
+   time.
+
+        Valid values :
+        SEED_ARG - from command line.
+        SEED_FUNC - from a system function.
+        SEED_VOLATILE - from volatile variables.
 */
 #ifndef SEED_METHOD
 #define SEED_METHOD SEED_VOLATILE
 #endif

 /* Configuration : MEM_METHOD
-	Defines method to get a block of memry.
-	
-	Valid values :
-	MEM_MALLOC - for platforms that implement malloc and have malloc.h.
-	MEM_STATIC - to use a static memory array.
-	MEM_STACK - to allocate the data block on the stack (NYI).
+        Defines method to get a block of memory.
+
+        Valid values :
+        MEM_MALLOC - for platforms that implement malloc and have malloc.h.
+        MEM_STATIC - to use a static memory array.
+        MEM_STACK - to allocate the data block on the stack (NYI).
 */
 #ifndef MEM_METHOD
 #define MEM_METHOD MEM_STACK
 #endif

 /* Configuration : MULTITHREAD
-	Define for parallel execution 
-	
-	Valid values :
-	1 - only one context (default).
-	N>1 - will execute N copies in parallel.
-	
-	Note : 
-	If this flag is defined to more then 1, an implementation for launching parallel contexts must be defined.
-	
-	Two sample implementations are provided. Use <USE_PTHREAD> or <USE_FORK> to enable them.
-	
-	It is valid to have a different implementation of <core_start_parallel> and <core_end_parallel> in <core_portme.c>,
-	to fit a particular architecture. 
+        Define for parallel execution
+
+        Valid values :
+        1 - only one context (default).
+        N>1 - will execute N copies in parallel.
+
+        Note :
+        If this flag is defined to more than 1, an implementation for launching
+   parallel contexts must be defined.
+
+        Two sample implementations are provided. Use <USE_PTHREAD> or <USE_FORK>
+   to enable them.
+
+        It is valid to have a different implementation of <core_start_parallel>
+   and <core_end_parallel> in <core_portme.c>, to fit a particular architecture.
 */
 #ifndef MULTITHREAD
 #define MULTITHREAD 1
 #define USE_PTHREAD 0
-#define USE_FORK 0
-#define USE_SOCKET 0
+#define USE_FORK    0
+#define USE_SOCKET  0
 #endif

 /* Configuration : MAIN_HAS_NOARGC
-	Needed if platform does not support getting arguments to main. 
-	
-	Valid values :
-	0 - argc/argv to main is supported
-	1 - argc/argv to main is not supported
-	
-	Note : 
-	This flag only matters if MULTITHREAD has been defined to a value greater then 1.
+        Needed if platform does not support getting arguments to main.
+
+        Valid values :
+        0 - argc/argv to main is supported
+        1 - argc/argv to main is not supported
+
+        Note :
+        This flag only matters if MULTITHREAD has been defined to a value
+   greater than 1.
 */
-#ifndef MAIN_HAS_NOARGC 
+#ifndef MAIN_HAS_NOARGC
 #define MAIN_HAS_NOARGC 0
 #endif

 /* Configuration : MAIN_HAS_NORETURN
-	Needed if platform does not support returning a value from main. 
-	
-	Valid values :
-	0 - main returns an int, and return value will be 0.
-	1 - platform does not support returning a value from main
+        Needed if platform does not support returning a value from main.
+
+        Valid values :
+        0 - main returns an int, and return value will be 0.
+        1 - platform does not support returning a value from main
 */
 #ifndef MAIN_HAS_NORETURN
 #define MAIN_HAS_NORETURN 0
 #endif

 /* Variable : default_num_contexts
-	Not used for this simple port, must cintain the value 1.
+        Not used for this simple port, must contain the value 1.
 */
 extern ee_u32 default_num_contexts;

-typedef struct CORE_PORTABLE_S {
-	ee_u8	portable_id;
+typedef struct CORE_PORTABLE_S
+{
+    ee_u8 portable_id;
 } core_portable;

 /* target specific init/fini */
 void portable_init(core_portable *p, int *argc, char *argv[]);
 void portable_fini(core_portable *p);

-#if !defined(PROFILE_RUN) && !defined(PERFORMANCE_RUN) && !defined(VALIDATION_RUN)
-#if (TOTAL_DATA_SIZE==1200)
+#if !defined(PROFILE_RUN) && !defined(PERFORMANCE_RUN) \
+    && !defined(VALIDATION_RUN)
+#if (TOTAL_DATA_SIZE == 1200)
 #define PROFILE_RUN 1
-#elif (TOTAL_DATA_SIZE==2000)
+#elif (TOTAL_DATA_SIZE == 2000)
 #define PERFORMANCE_RUN 1
 #else
 #define VALIDATION_RUN 1
--- a/riscv-coremark/riscv64-baremetal/core_portme.c
+++ b/riscv-coremark/riscv64-baremetal/core_portme.c
@ -125,6 +125,28 @@ void portable_free(void *p) {
 #if SAMPLE_TIME_IMPLEMENTATION
 /** Define Host specific (POSIX), or target specific global time variables. */
 static CORETIMETYPE start_time_val, stop_time_val;
+static unsigned long start_instr_val, stop_instr_val;
+
+/* Function: minstretFunc
+	Read the MINSTRET csr and return its current value (it does not reset the counter).
+*/
+unsigned long minstretFunc(void)
+{
+	unsigned long minstretRead = read_csr(minstret);
+	//ee_printf("Minstret is %lu\n", minstretRead);
+	return minstretRead;
+}
+
+/* Function: minstretDiff
+	Return stop_instr_val minus start_instr_val, i.e. the difference between the MINSTRET
+	csr readings saved by stop_time() and start_time(), which gives the number of machine
+	instructions retired between those two points in time.
+*/
+unsigned long minstretDiff(void)
+{
+	unsigned long minstretDifference = MYTIMEDIFF(stop_instr_val, start_instr_val);
+	return minstretDifference;
+}

 /* Function: start_time
 	This function will be called right before starting the timed portion of the benchmark.
@ -133,9 +155,10 @@ static CORETIMETYPE start_time_val, stop_time_val;
 	or zeroing some system parameters - e.g. setting the cpu clocks cycles to 0.
 */
 void start_time(void) {
+	start_instr_val = minstretFunc();
 	GETMYTIME(start_time_val);
-	ee_printf("Timer started\n");
-	ee_printf("  MTIME: %u\n", start_time_val);
+	//ee_printf("Timer started\n");
+	//ee_printf("  MTIME: %u\n", start_time_val);
 #if CALLGRIND_RUN
 	CALLGRIND_START_INSTRUMENTATION
 #endif
@ -157,8 +180,9 @@ void stop_time(void) {
    asm volatile("int3");/*1 */
 #endif
 	GETMYTIME(stop_time_val);
-	ee_printf("Timer stopped\n");
-	ee_printf("  MTIME: %u\n", stop_time_val);
+	stop_instr_val = minstretFunc();
+	//ee_printf("Timer stopped\n");
+	//ee_printf("  MTIME: %u\n", stop_time_val);
 }
 /* Function: get_time
 	Return an abstract "ticks" number that signifies time on the system.
@ -171,7 +195,8 @@ void stop_time(void) {
 */
 CORE_TICKS get_time(void) {
 	CORE_TICKS elapsed=(CORE_TICKS)(MYTIMEDIFF(stop_time_val, start_time_val));
-	ee_printf("    Elapsed MTIME: %u\n", elapsed);
+	//ee_printf("    Elapsed MTIME: %u\n", elapsed);
+	//ee_printf("    Elapsed MINSTRET: %lu\n", minstretDiff());
 	return elapsed;
 }
 /* Function: time_in_secs
@ -183,7 +208,7 @@ CORE_TICKS get_time(void) {
 secs_ret time_in_secs(CORE_TICKS ticks) {
 	secs_ret retval=((secs_ret)ticks) / (secs_ret)EE_TICKS_PER_SEC;
 	int retvalint = (int)retval;
-	ee_printf("  RETURN VALUE FROM TIME IN SECS FUNCTION: %d\n", retvalint);
+	//ee_printf("RETURN VALUE FROM TIME IN SECS FUNCTION: %d\n", retvalint);
 	return retval;
 }
 #else
--- a/wally-pipelined/config/coremark_bare/wally-config.vh
+++ b/wally-pipelined/config/coremark_bare/wally-config.vh
@ -34,7 +34,8 @@
 `define XLEN 64

 //`define MISA (32'h00000104)
-`define MISA (32'h00001104 | 1<<5 | 1<<18 | 1 << 20 | 1 << 12 | 1 << 0)
+//`define MISA (32'h00001104 | 1<<5 | 1<<18 | 1 << 20 | 1 << 12 | 1 << 0)
+`define MISA (32'h00000104 | 0 << 5 | 0 << 3 | 1 << 18 | 1 << 20 | 1 << 12 | 1 << 0)
 `define ZCSR_SUPPORTED 1
 `define COUNTERS 32
 `define ZCOUNTERS_SUPPORTED 1
@ -53,7 +54,7 @@
 `define DTLB_ENTRIES 32

 // Legal number of PMP entries are 0, 16, or 64
-`define PMP_ENTRIES 16
+`define PMP_ENTRIES 64

 // Address space
 `define RESET_VECTOR 64'h0000000080000000
@ -66,23 +67,23 @@
 // Range should be a thermometer code with 0's in the upper bits and 1s in the lower bits

 `define BOOTTIM_SUPPORTED 1'b1
-`define BOOTTIM_BASE   34'h00001000 
-`define BOOTTIM_RANGE  34'h00000FFF
+`define BOOTTIM_BASE   56'h00001000 
+`define BOOTTIM_RANGE  56'h00000FFF
 `define TIM_SUPPORTED 1'b1
-`define TIM_BASE       34'h80000000
-`define TIM_RANGE      34'h07FFFFFF
+`define TIM_BASE       56'h80000000
+`define TIM_RANGE      56'h07FFFFFF
 `define CLINT_SUPPORTED 1'b1
-`define CLINT_BASE  34'h02000000
-`define CLINT_RANGE 34'h0000FFFF
+`define CLINT_BASE  56'h02000000
+`define CLINT_RANGE 56'h0000FFFF
 `define GPIO_SUPPORTED 1'b1
-`define GPIO_BASE   34'h10012000
-`define GPIO_RANGE  34'h000000FF
+`define GPIO_BASE   56'h10012000
+`define GPIO_RANGE  56'h000000FF
 `define UART_SUPPORTED 1'b1
-`define UART_BASE   34'h10000000
-`define UART_RANGE  34'h00000007
+`define UART_BASE   56'h10000000
+`define UART_RANGE  56'h00000007
 `define PLIC_SUPPORTED 1'b1
-`define PLIC_BASE   34'h0C000000
-`define PLIC_RANGE  34'h03FFFFFF
+`define PLIC_BASE   56'h0C000000
+`define PLIC_RANGE  56'h03FFFFFF

 // Test modes

--- a/wally-pipelined/config/rv32icfd/BTBPredictor.txt
+++ b/wally-pipelined/config/rv32icfd/BTBPredictor.txt
--- a/wally-pipelined/config/rv32icfd/twoBitPredictor.txt
+++ b/wally-pipelined/config/rv32icfd/twoBitPredictor.txt
--- a/wally-pipelined/config/rv32icfd/wally-config.vh
+++ b/wally-pipelined/config/rv32icfd/wally-config.vh
@ -0,0 +1,106 @@
+//////////////////////////////////////////
+// wally-config.vh
+//
+// Written: David_Harris@hmc.edu 4 January 2021
+// Modified: 
+//
+// Purpose: Specify which features are configured
+//          Macros to determine which modes are supported based on MISA
+// 
+// A component of the Wally configurable RISC-V project.
+// 
+// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, 
+// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software 
+// is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 
+// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT 
+// OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+///////////////////////////////////////////
+
+// include shared configuration
+`include "wally-shared.vh"
+
+`define BUILDROOT 0
+`define BUSYBEAR 0
+
+// RV32 or RV64: XLEN = 32 or 64
+`define XLEN 32
+
+`define MISA (32'h00000104 | 1 << 5 | 1 << 20 | 1 << 18 | 1 << 12)
+`define ZCSR_SUPPORTED 1
+`define COUNTERS 32
+`define ZCOUNTERS_SUPPORTED 1
+
+// Microarchitectural Features
+`define UARCH_PIPELINED 1
+`define UARCH_SUPERSCALR 0
+`define UARCH_SINGLECYCLE 0
+`define MEM_DCACHE 0
+`define MEM_DTIM 1
+`define MEM_ICACHE 0
+`define MEM_VIRTMEM 1
+`define VECTORED_INTERRUPTS_SUPPORTED 1
+
+`define ITLB_ENTRIES 32
+`define DTLB_ENTRIES 32
+
+// Legal number of PMP entries are 0, 16, or 64
+`define PMP_ENTRIES 16
+
+// Address space
+`define RESET_VECTOR 32'h80000000
+
+// Peripheral Addresses
+// Peripheral memory space extends from BASE to BASE+RANGE
+// Range should be a thermometer code with 0's in the upper bits and 1s in the lower bits
+
+// *** each of these is `PA_BITS wide. Is this parameterizable INSIDE the config file?
+`define BOOTTIM_SUPPORTED 1'b1
+`define BOOTTIM_BASE   34'h00001000 
+`define BOOTTIM_RANGE  34'h00000FFF
+`define TIM_SUPPORTED 1'b1
+`define TIM_BASE       34'h80000000
+`define TIM_RANGE      34'h07FFFFFF
+`define CLINT_SUPPORTED 1'b1
+`define CLINT_BASE  34'h02000000
+`define CLINT_RANGE 34'h0000FFFF
+`define GPIO_SUPPORTED 1'b1
+`define GPIO_BASE   34'h10012000
+`define GPIO_RANGE  34'h000000FF
+`define UART_SUPPORTED 1'b1
+`define UART_BASE   34'h10000000
+`define UART_RANGE  34'h00000007
+`define PLIC_SUPPORTED 1'b1
+`define PLIC_BASE   34'h0C000000
+`define PLIC_RANGE  34'h03FFFFFF
+
+// Bus Interface width
+`define AHBW 32
+
+// Test modes
+
+// Tie GPIO outputs back to inputs
+`define GPIO_LOOPBACK_TEST 1
+
+// Hardware configuration
+`define UART_PRESCALE 1
+
+// Interrupt configuration
+`define PLIC_NUM_SRC 4
+// comment out the following if >=32 sources
+`define PLIC_NUM_SRC_LT_32
+`define PLIC_GPIO_ID 3
+`define PLIC_UART_ID 4
+
+`define TWO_BIT_PRELOAD "../config/rv32icfd/twoBitPredictor.txt"
+`define BTB_PRELOAD "../config/rv32icfd/BTBPredictor.txt"
+`define BPRED_ENABLED 1
+`define BPTYPE "BPGSHARE" // BPLOCALPAg or BPGLOBAL or BPTWOBIT or BPGSHARE
+`define TESTSBP 0
--- a/wally-pipelined/config/rv64icfd/wally-config.vh
+++ b/wally-pipelined/config/rv64icfd/wally-config.vh
@ -46,7 +46,7 @@
 `define MEM_DCACHE 0
 `define MEM_DTIM 1
 `define MEM_ICACHE 0
-`define MEM_VIRTMEM 0\1
+`define MEM_VIRTMEM 1
 `define VECTORED_INTERRUPTS_SUPPORTED 1

 `define ITLB_ENTRIES 32
@ -56,10 +56,7 @@
 `define PMP_ENTRIES 16

 // Address space
-`define RESET_VECTOR 64'h0000000080000000
-
-// Bus Interface width
-`define AHBW 64
+`define RESET_VECTOR 64'h80000000

 // Peripheral Addresses
 // Peripheral memory space extends from BASE to BASE+RANGE
@ -84,6 +81,9 @@
 `define PLIC_BASE   56'h0C000000
 `define PLIC_RANGE  56'h03FFFFFF

+// Bus Interface width
+`define AHBW 64
+
 // Test modes

 // Tie GPIO outputs back to inputs
@ -101,6 +101,7 @@

 `define TWO_BIT_PRELOAD "../config/rv64icfd/twoBitPredictor.txt"
 `define BTB_PRELOAD "../config/rv64icfd/BTBPredictor.txt"
+
 `define BPRED_ENABLED 1
 `define BPTYPE "BPGSHARE" // BPLOCALPAg or BPGLOBAL or BPTWOBIT or BPGSHARE
 `define TESTSBP 0
--- a/wally-pipelined/linux-testgen/WALLY-README.txt
+++ b/wally-pipelined/linux-testgen/WALLY-README.txt
@ -0,0 +1,57 @@
+If you do not need to update the Linux image, then go to ./linux-testvectors and 
+use tvCopier.py or tvLinker.sh to copy/link premade RAMs and testvectors from Tera.
+The RAMs are needed for Wally to run the Linux code, and the testvectors are needed
+to verify Wally is executing the code correctly.
+
+If you instead wish to regenerate the RAMs and testvectors from a new Linux image,
+you'll need to build the new Linux image, simulate it, and parse its output,
+as described below.
+
+*To build a new Linux image:
+     1. Git clone the Buildroot repository to ./buildroot:
+            git clone https://github.com/buildroot/buildroot.git 
+        For reference, Wally (*** will) be proven to work on an image built using
+        Buildroot when the following was the most recent commit to the Buildroot repo:
+            commit 4047e10ed6e20492bae572d4929eaa5d67eed746
+            Author: Gwenhael Goavec-Merou <gwenhael.goavec-merou@trabucayre.com>
+            Date:   Wed Jun 30 06:27:10 2021 +0200
+
+     2. If you wish to modify the configs, then in ./buildroot:
+        a. Run "make menuconfig" or "make linux-menuconfig" or "make busybox-menuconfig".
+        b. Use the TUI (terminal UI) to load in the existing configs.
+
+           For menuconfig, you can load in the source file from
+               "../buildroot-config-src/main.config"
+
+           For linux-menuconfig or busybox-menuconfig, load in from 
+               "../../../../buildroot-config-src/<type>.config"
+           because for linux and busybox, make traverses down to
+                ./buildroot/output/build/<linux or busybox>.
+          
+           One annoying thing about the TUI is that if it has a path already loaded,
+           then before you can enter the new path to buildroot-config-src, you need to
+           delete the existing one from the textbox. Doing so requires more than backspace.
+           Once you've deleted as much of the existing path as you can see, arrow left to 
+           check if there is more text you need to delete.
+
+        c. Likewise, when you are done editing, tell the TUI to save to the same location.
+
+     3. Finally go to ./buildroot-config-src and run make-buildroot.sh.
+        This script copies ./buildroot-config-src/main.config to ./buildroot/.config
+        and then invokes make. This is clumsy but effective because buildroot
+        sometimes does weird things to .config, like moving it to .config.old and 
+        making a new .config -- doing so can really mess up symbolic/hard links.
+
+     4. If you'd like debugging symbols, then reconfigure Buildroot to output "vmlinux"
+        and run make-buildroot again.
+
+*To generate new RAMs and testvectors from a Linux image:
+    1. Symlink ./buildroot-image-output to either your new image in ./buildroot/output/images
+       or the existing image at /courses/e190ax/buildroot-image-output on Tera. 
+       This might require first deleting the empty buildroot-image-output directory.
+    2. Then run ./testvector-generation/logBuildrootMem.sh to generate RAMs.
+    3. Then run ./testvector-generation/logAllBuildroot.sh to generate testvectors.
+
+       These latter two steps require QEMU.
+       Note that you can only have one instance of QEMU open at a time!
+       At least on Tera, it seems. Check "ps -ef" to see if anybody else is running QEMU.
--- a/wally-pipelined/linux-testgen/buildroot-config-src/busybox.config
+++ b/wally-pipelined/linux-testgen/buildroot-config-src/busybox.config
--- a/wally-pipelined/linux-testgen/buildroot-config-src/linux.config
+++ b/wally-pipelined/linux-testgen/buildroot-config-src/linux.config
--- a/wally-pipelined/linux-testgen/buildroot-config-src/main.config
+++ b/wally-pipelined/linux-testgen/buildroot-config-src/main.config
--- a/wally-pipelined/linux-testgen/buildroot-config-src/make-buildroot.sh
+++ b/wally-pipelined/linux-testgen/buildroot-config-src/make-buildroot.sh
@ -0,0 +1,3 @@
+cp main.config ../buildroot/.config || exit 1  # abort rather than build with a stale or missing .config
+cd ../buildroot || exit 1                      # never run make outside the Buildroot tree
+make
--- a/wally-pipelined/linux-testgen/fix_mem.py
+++ b/wally-pipelined/linux-testgen/fix_mem.py
@ -1,9 +0,0 @@
-#! /usr/bin/python3
-test_dir = '/courses/e190ax/buildroot_boot/'
-infiles = ['bootmemGDB.txt', 'ramGDB.txt']
-outfiles = ['bootmem.txt', 'ram.txt']
-for i in range(len(infiles)):
-    with open(f'{test_dir}{infiles[i]}', 'r') as f:
-        with open(f'{test_dir}{outfiles[i]}', 'w') as w:
-            for l in f:
-                w.write(f'{"".join([x[2:] for x in l.split()[:0:-1]])}\n')
--- a/wally-pipelined/linux-testgen/gdbinit_qemulog
+++ b/wally-pipelined/linux-testgen/gdbinit_qemulog
@ -1,10 +0,0 @@
-set pagination off
-target extended-remote :1234
-b *0xffffffe00020144e
-c
-c
-c
-c
-set confirm off
-kill
-q
--- a/wally-pipelined/linux-testgen/linux-testvectors/intermediate-outputs/git_create_dir.txt
+++ b/wally-pipelined/linux-testgen/linux-testvectors/intermediate-outputs/git_create_dir.txt
@ -0,0 +1 @@
+This file only exists so that git will create this directory.
--- a/wally-pipelined/linux-testgen/linux-testvectors/tvUnlinker.sh
+++ b/wally-pipelined/linux-testgen/linux-testvectors/tvUnlinker.sh
@ -0,0 +1,10 @@
+# This could be nice to use if you want to mess with the testvectors
+# without corrupting the stable copies on Tera.
+unlink parsedCSRs.txt
+unlink parsedMemRead.txt
+unlink parsedMemWrite.txt
+unlink parsedPC.txt
+unlink parsedRegs.txt
+unlink bootmem.txt
+unlink ram.txt
+echo "Done!"
--- a/wally-pipelined/linux-testgen/logAllBuildroot.sh
+++ b/wally-pipelined/linux-testgen/logAllBuildroot.sh
@ -1,40 +0,0 @@
-# Oftentimes this script runs so long you'll go to sleep.
-# But you don't want the script to die when your computer goes to sleep.
-# So consider invoking this with nohup (i.e. "nohup ./logAllBuildroot.sh")
-# You can run "tail -f nohup.out" to see what would've
-# outputted to the terminal if you didn't use nohup
-
-# =========== Debug the Process ========== 
-# Uncomment this version for GDB/QEMU debugging
-# - Opens up GDB interactively
-# - Logs raw QEMU output to qemu_output.txt
-#(qemu-system-riscv64 -M virt -nographic -bios /courses/e190ax/qemu_sim/rv64_initrd/buildroot_experimental/output/images/fw_jump.elf -kernel /courses/e190ax/qemu_sim/rv64_initrd/buildroot_experimental/output/images/Image -append "root=/dev/vda ro" -initrd /courses/e190ax/qemu_sim/rv64_initrd/buildroot_experimental/output/images/rootfs.cpio -d nochain,cpu,in_asm -serial /dev/null -singlestep -s -S 2> qemu_output.txt) & riscv64-unknown-elf-gdb
-
-# Uncomment this version to generate qemu_output.txt
-# - Uses GDB script
-# - Logs raw QEMU output to qemu_output.txt
-#(qemu-system-riscv64 -M virt -nographic -bios /courses/e190ax/qemu_sim/rv64_initrd/buildroot_experimental/output/images/fw_jump.elf -kernel /courses/e190ax/qemu_sim/rv64_initrd/buildroot_experimental/output/images/Image -append "root=/dev/vda ro" -initrd /courses/e190ax/qemu_sim/rv64_initrd/buildroot_experimental/output/images/rootfs.cpio -d nochain,cpu,in_asm -serial /dev/null -singlestep -s -S 2>qemu_output.txt) & riscv64-unknown-elf-gdb -x gdbinit_qemulog_debug
-
-# Uncomment this version for parse_qemu.py debugging
-# - Uses qemu_output.txt
-# - Makes qemu_in_gdb_format.txt
-# - Logs parse_qemu.py's simulated gdb output to qemu_in_gdb_format.txt
-#cat qemu_output.txt | ./parse_qemu.py >qemu_in_gdb_format.txt
-#cat qemu_output.txt | ./parse_qemu.py | ./parse_gdb_output.py "/courses/e190ax/buildroot_boot/"
-
-# Uncomment this version in case you just want to have qemu_in_gdb_format.txt around
-# It is often helpful for general debugging
-(qemu-system-riscv64 -M virt -nographic -bios /courses/e190ax/qemu_sim/rv64_initrd/buildroot_experimental/output/images/fw_jump.elf -kernel /courses/e190ax/qemu_sim/rv64_initrd/buildroot_experimental/output/images/Image -append "root=/dev/vda ro" -initrd /courses/e190ax/qemu_sim/rv64_initrd/buildroot_experimental/output/images/rootfs.cpio -d nochain,cpu,in_asm -serial /dev/null -singlestep -s -S 2>&1 >/dev/null | ./parse_qemu.py >/courses/e190ax/buildroot_boot/qemu_in_gdb_format.txt) & riscv64-unknown-elf-gdb -x gdbinit_qemulog
-# Split qemu_in_gdb_format.txt into chunks of 100,000 instructions for easier inspection
-#cd /courses/e190ax/buildroot_boot
-#split -d -l 5600000 qemu_in_gdb_format.txt --verbose
-
-# Uncomment this version for parse_gdb_output.py debugging
-# - Uses qemu_in_gdb_format.txt
-# - Logs info needed by buildroot testbench
-#cat qemu_in_gdb_format.txt | ./parse_gdb_output.py "/courses/e190ax/buildroot_boot/"
-
-# =========== Just Do the Thing ========== 
-# Uncomment this version for the whole thing 
-# - Logs info needed by buildroot testbench
-#(qemu-system-riscv64 -M virt -nographic -bios /courses/e190ax/qemu_sim/rv64_initrd/buildroot_experimental/output/images/fw_jump.elf -kernel /courses/e190ax/qemu_sim/rv64_initrd/buildroot_experimental/output/images/Image -append "root=/dev/vda ro" -initrd /courses/e190ax/qemu_sim/rv64_initrd/buildroot_experimental/output/images/rootfs.cpio -d nochain,cpu,in_asm -serial /dev/null -singlestep -s -S 2>&1 >/dev/null | ./parse_qemu.py | ./parse_gdb_output.py "/courses/e190ax/buildroot_boot_new/") & riscv64-unknown-elf-gdb -x gdbinit_qemulog
--- a/wally-pipelined/linux-testgen/logBuildrootMem.sh
+++ b/wally-pipelined/linux-testgen/logBuildrootMem.sh
@ -1,4 +0,0 @@
-(qemu-system-riscv64 -M virt -nographic -bios /courses/e190ax/qemu_sim/rv64_initrd/buildroot_experimental/output/images/fw_jump.elf -kernel /courses/e190ax/qemu_sim/rv64_initrd/buildroot_experimental/output/images/Image -append "root=/dev/vda ro" -initrd /courses/e190ax/qemu_sim/rv64_initrd/buildroot_experimental/output/images/rootfs.cpio -d nochain,cpu,in_asm -serial /dev/null -singlestep -s -S 2>/dev/null >/dev/null ) &
-riscv64-unknown-elf-gdb -x gdbinit_mem
-#sed -i '$d' $file
-echo "Done"
--- a/wally-pipelined/linux-testgen/testvector-generation/combineGDBs.py
+++ b/wally-pipelined/linux-testgen/testvector-generation/combineGDBs.py
--- a/wally-pipelined/linux-testgen/testvector-generation/deprecated-scripts/parseAllBusybear.sh
+++ b/wally-pipelined/linux-testgen/testvector-generation/deprecated-scripts/parseAllBusybear.sh
--- a/wally-pipelined/linux-testgen/testvector-generation/deprecated-scripts/setup_OVP.sh
+++ b/wally-pipelined/linux-testgen/testvector-generation/deprecated-scripts/setup_OVP.sh
--- a/wally-pipelined/linux-testgen/testvector-generation/deprecated-scripts/start_OVP.sh
+++ b/wally-pipelined/linux-testgen/testvector-generation/deprecated-scripts/start_OVP.sh
--- a/wally-pipelined/linux-testgen/testvector-generation/find_csr.sh
+++ b/wally-pipelined/linux-testgen/testvector-generation/find_csr.sh
--- a/wally-pipelined/linux-testgen/testvector-generation/fix_mem.py
+++ b/wally-pipelined/linux-testgen/testvector-generation/fix_mem.py
@ -0,0 +1,11 @@
+#! /usr/bin/python3
+# Rewrites each line of a GDB memory dump as one hex string: token 0 is dropped, the remaining
+# tokens are taken back to front, and each loses its first two characters (presumably "0x" - confirm).
+gdbMemfileDir = '../linux-testvectors/intermediate-outputs/'
+fixedMemfileDir = '../linux-testvectors/'
+infiles = ['bootmemGDB.txt', 'ramGDB.txt']
+outfiles = ['bootmem.txt', 'ram.txt']
+for infile, outfile in zip(infiles, outfiles):
+    with open(f'{gdbMemfileDir}{infile}', 'r') as f, open(f'{fixedMemfileDir}{outfile}', 'w') as w:
+        for l in f:
+            w.write(''.join(x[2:] for x in l.split()[:0:-1]) + '\n')
--- a/wally-pipelined/linux-testgen/testvector-generation/gdbinit
+++ b/wally-pipelined/linux-testgen/testvector-generation/gdbinit
--- a/wally-pipelined/linux-testgen/testvector-generation/gdbinit_debug
+++ b/wally-pipelined/linux-testgen/testvector-generation/gdbinit_debug
@ -0,0 +1,3 @@
+file ../buildroot-image-output/vmlinux
+set pagination off
+target extended-remote :1236
--- a/wally-pipelined/linux-testgen/testvector-generation/gdbinit_mem
+++ b/wally-pipelined/linux-testgen/testvector-generation/gdbinit_mem
@ -1,20 +1,20 @@
 set pagination off
-target extended-remote :1234
+target extended-remote :1235
 set logging overwrite on
 set logging redirect on
 printf "Creating bootmemGDB.txt\n"
-set logging file /courses/e190ax/buildroot_boot/bootmemGDB.txt
+set logging file ../linux-testvectors/intermediate-outputs/bootmemGDB.txt
 set logging on
 x/4096xb 0x1000
 set logging off
 printf "Creating bootmem_untrimmed_GDB.txt\n"
 printf "Warning - please verify that the second half of bootmem_untrimmed_GDB.txt is all 0s\n"
-set logging file /courses/e190ax/buildroot_boot/bootmem_untrimmed_GDB.txt
+set logging file ../linux-testvectors/intermediate-outputs/bootmem_untrimmed_GDB.txt
 set logging on
 x/8192xb 0x1000
 set logging off
 printf "Creating ramGDB.txt\n"
-set logging file /courses/e190ax/buildroot_boot/ramGDB.txt
+set logging file ../linux-testvectors/intermediate-outputs/ramGDB.txt
 set logging on
 x/134217728xb 0x80000000
 set logging off
--- a/wally-pipelined/linux-testgen/testvector-generation/gdbinit_qemulog
+++ b/wally-pipelined/linux-testgen/testvector-generation/gdbinit_qemulog
@ -0,0 +1,11 @@
+set pagination off
+target extended-remote :1236
+file ../buildroot-image-output/vmlinux
+b arch_cpu_idle
+c
+c
+c
+c
+set confirm off
+kill
+q
--- a/wally-pipelined/linux-testgen/testvector-generation/gdbinit_qemulog_debug
+++ b/wally-pipelined/linux-testgen/testvector-generation/gdbinit_qemulog_debug
@ -1,9 +1,10 @@
 set pagination off
 target extended-remote :1234
+maint print symbols symbols.txt
 b *0x000000008020103c
 c
 del 1
-stepi 100
+stepi 100000
 set confirm off
 kill
 q
--- a/wally-pipelined/linux-testgen/testvector-generation/logAllBuildroot.sh
+++ b/wally-pipelined/linux-testgen/testvector-generation/logAllBuildroot.sh
@ -0,0 +1,44 @@
+# Oftentimes this script runs so long you'll go to sleep.
+# But you don't want the script to die when your computer goes to sleep.
+# So consider invoking this with nohup (i.e. "nohup ./logAllBuildroot.sh")
+# You can run "tail -f nohup.out" to see what would've
+# outputted to the terminal if you didn't use nohup
+
+customQemu="/courses/e190ax/qemu_sim/rv64_initrd/qemu_experimental/qemu/build/qemu-system-riscv64"
+#customQemu="qemu-system-riscv64"
+imageDir="../buildroot-image-output"
+intermedDir="../linux-testvectors/intermediate-outputs"
+outDir="../linux-testvectors"
+
+# =========== Debug the Process ========== 
+# Uncomment this version for QEMU debugging of kernel
+#  - good for poking around VM if it boots up
+#  - good for running QEMU commands (press "Ctrl-A" then "c" to open QEMU command prompt)
+#$customQemu -M virt -nographic -bios $imageDir/fw_jump.elf -kernel $imageDir/Image -append "root=/dev/vda ro" -initrd $imageDir/rootfs.cpio 
+# Uncomment this version for GDB debugging of kernel
+#  - attempts to load in symbols from "vmlinux"
+#  - good for looking at backtraces when Linux gets stuck for some reason 
+#$customQemu -M virt -nographic -bios $imageDir/fw_jump.elf -kernel $imageDir/Image -append "root=/dev/vda ro" -initrd $imageDir/rootfs.cpio -gdb tcp::1236 -S & riscv64-unknown-elf-gdb -x gdbinit_debug
+
+# Uncomment this version to generate qemu_output.txt
+# - Uses GDB script
+# - Logs raw QEMU output to qemu_output.txt
+#($customQemu -M virt -nographic -bios $imageDir/fw_jump.elf -kernel $imageDir/Image -append "root=/dev/vda ro" -initrd $imageDir/rootfs.cpio -d nochain,cpu,in_asm -serial /dev/null -singlestep -gdb tcp::1236 -S 2> $intermedDir/qemu_output.txt) & riscv64-unknown-elf-gdb -x gdbinit_qemulog_debug
+
+# Uncomment this version for parse_qemu.py debugging
+# - Uses qemu_output.txt
+# - Makes qemu_in_gdb_format.txt
+# - Splits qemu_in_gdb_format.txt into chunks of 100,000 instrs
+#cat $intermedDir/qemu_output.txt | ./parse_qemu.py >$intermedDir/qemu_in_gdb_format.txt
+#cd $intermedDir
+#split -d -l 5600000 ./qemu_in_gdb_format.txt --verbose
+#cd ../../testvector-generation
+
+# Uncomment this version for parse_gdb_output.py debugging
+# - Uses qemu_in_gdb_format.txt
+# - Makes testvectors
+#cat $intermedDir/qemu_in_gdb_format.txt | ./parse_gdb_output.py "$outDir"
+
+# =========== Just Do the Thing ========== 
+# Uncomment this version for the whole thing 
+# - Logs info needed by buildroot testbench
+($customQemu -M virt -nographic -bios $imageDir/fw_jump.elf -kernel $imageDir/Image -append "root=/dev/vda ro" -initrd $imageDir/rootfs.cpio -d nochain,cpu,in_asm -serial /dev/null -singlestep -gdb tcp::1236 -S 2>&1 >/dev/null | ./parse_qemu.py | ./parse_gdb_output.py "$outDir") & riscv64-unknown-elf-gdb -x gdbinit_qemulog
--- a/wally-pipelined/linux-testgen/testvector-generation/logBuildrootMem.sh
+++ b/wally-pipelined/linux-testgen/testvector-generation/logBuildrootMem.sh
@ -0,0 +1,7 @@
+customQemu="/courses/e190ax/qemu_sim/rv64_initrd/qemu_experimental/qemu/build/qemu-system-riscv64"
+imageDir="../buildroot-image-output"
+($customQemu -M virt -nographic -bios $imageDir/fw_jump.elf -kernel $imageDir/Image -append "root=/dev/vda ro" -initrd $imageDir/rootfs.cpio -d nochain,cpu,in_asm -serial /dev/null -singlestep -gdb tcp::1235 -S 2>/dev/null >/dev/null) &
+riscv64-unknown-elf-gdb -x gdbinit_mem
+echo "Translating Mem from GDB to Questa format"
+./fix_mem.py
+echo "Done"
--- a/wally-pipelined/linux-testgen/testvector-generation/parse_gdb_output.py
+++ b/wally-pipelined/linux-testgen/testvector-generation/parse_gdb_output.py
@ -9,7 +9,7 @@ csrs = ['fcsr','mcause','mcounteren','medeleg','mepc','mhartid','mideleg','mie',
 list(map(csrs.remove, ['fcsr','mhartid','pmpcfg0','pmpaddr0','mip']))
 #output_path = '/courses/e190ax/busybear_boot_new/'
 #output_path = '/courses/e190ax/buildroot_boot/'
-output_path = sys.argv[1]
+output_path = sys.argv[1]+'/'
 print(f'output dir: {output_path}')
 instrs = -1
 try:
--- a/wally-pipelined/linux-testgen/testvector-generation/parse_qemu.py
+++ b/wally-pipelined/linux-testgen/testvector-generation/parse_qemu.py
@ -3,13 +3,16 @@ import fileinput, sys

 sys.stderr.write("reminder: this script takes input from stdin\n")
 parseState = "idle"
+beginPageFault = 0
 inPageFault = 0
+endPageFault = 0
 CSRs = {}
 pageFaultCSRs = {}
 regs = {}
 pageFaultRegs = {}
 instrs = {}
 instrCount = 0
+returnAdr = 0

 def printPC(l):
    global parseState, inPageFault, CSRs, pageFaultCSRs, regs, pageFaultCSRs, instrs, instrCount
@ -33,8 +36,8 @@ def printCSRs():

 def parseCSRs(l):
    global parseState, inPageFault, CSRs, pageFaultCSRs, regs, pageFaultCSRs, instrs
-    if l.strip() and (not l.startswith("Disassembler")) and (not l.startswith("Please")):
-        if l.startswith(' x0/zero'):
+    if l.strip() and (not l.startswith("Disassembler")) and (not l.startswith("Please")) and not inPageFault:
+        if l.startswith(' x0/zero'): 
            parseState = "regFile"
            instr = instrs[CSRs["pc"]]
            printPC(instr)
@ -42,24 +45,31 @@ def parseCSRs(l):
        else:
            csr = l.split()[0]
            val = int(l.split()[1],16)
-            if inPageFault:
+            # Commented out this conditional because the pageFault instrs don't corrupt CSRs
+            #if inPageFault:
                # Not sure if these CSRs should be updated or not during page fault.
-                if l.startswith("mstatus") or l.startswith("mepc") or l.startswith("mcause") or l.startswith("mtval") or l.startswith("sepc") or l.startswith("scause") or l.startswith("stval"):
+                #if l.startswith("mstatus") or l.startswith("mepc") or l.startswith("mcause") or l.startswith("mtval") or l.startswith("sepc") or l.startswith("scause") or l.startswith("stval"):
                    # We do update some CSRs
-                    CSRs[csr] = val
-                else:
+                #    CSRs[csr] = val
+                #else:
                    # Others we preserve until changed later
-                    pageFaultCSRs[csr] = val
-            elif pageFaultCSRs and (csr in pageFaultCSRs):
-                if (val != pageFaultCSRs[csr]):
-                    del pageFaultCSRs[csr]
-                    CSRs[csr] = val
+                #    pageFaultCSRs[csr] = val
+            #elif pageFaultCSRs and (csr in pageFaultCSRs):
+            #    if (val != pageFaultCSRs[csr]):
+            #        del pageFaultCSRs[csr]
+            #        CSRs[csr] = val
+            #else:
+            #    CSRs[csr] = val
+            #
+            # However SEPC and STVAL do get corrupted upon exiting
+            if endPageFault and ((csr == 'sepc') or (csr == 'stval')):
+                CSRs[csr] = returnAdr
            else:
                CSRs[csr] = val

 def parseRegs(l):
    global parseState, inPageFault, CSRs, pageFaultCSRs, regs, pageFaultCSRs, instrs
-    if "mcounteren" in l:
+    if "pc" in l:
        printCSRs()
        # New non-disassembled instruction
        parseState = "CSRs"
@ -100,8 +110,12 @@ for l in fileinput.input():
    elif (parseState == "instr") and l.startswith('0x'):
        if "out of bounds" in l:
            sys.stderr.write("Detected QEMU page fault error\n")
+            beginPageFault = not inPageFault  # true only on the first "out of bounds" line of a run (~0 and ~1 are both truthy)
+            if beginPageFault:
+                returnAdr = int(l.split()[0][2:-1], 16)  # first token minus its 2-char prefix and last char, as hex; parseCSRs writes it into sepc/stval once the fault ends
            inPageFault = 1
        else: 
+            endPageFault = inPageFault
            inPageFault = 0
            adr = int(l.split()[0][2:-1], 16)
            instrs[adr] = l
--- a/wally-pipelined/regression/sim-wally-batch-rv32icfd
+++ b/wally-pipelined/regression/sim-wally-batch-rv32icfd
@ -0,0 +1,3 @@
+vsim -c <<!
+do wally-pipelined-batch-rv32icfd.do ../config/rv32icfd rv32icfd
+!
--- a/wally-pipelined/regression/sim-wally-rv32icfd
+++ b/wally-pipelined/regression/sim-wally-rv32icfd
@ -0,0 +1 @@
+vsim -do wally-pipelined-rv32icfd.do
--- a/wally-pipelined/regression/wally-pipelined-batch-rv32icfd.do
+++ b/wally-pipelined/regression/wally-pipelined-batch-rv32icfd.do
@ -0,0 +1,42 @@
+# wally-pipelined-batch-rv32icfd.do
+#
+# Modification by Oklahoma State University & Harvey Mudd College
+# Use with Testbench 
+# James Stine, 2008; David Harris 2021
+# Go Cowboys!!!!!!
+#
+# Takes 1:10 to run RV64IC tests using gui
+
+# Use this wally-pipelined-batch-rv32icfd.do file to run this example.
+# Either bring up ModelSim and type the following at the "ModelSim>" prompt:
+#     do wally-pipelined-batch-rv32icfd.do
+# or, to run from a shell, type the following at the shell prompt:
+#     vsim -do wally-pipelined-batch-rv32icfd.do -c
+# (omit the "-c" to see the GUI while running from the shell)
+
+onbreak {resume}
+
+# create library
+if [file exists work_$2] {
+    vdel -lib work_$2 -all
+}
+vlib work_$2
+
+# compile source files
+# suppress spurious warnings about 
+# "Extra checking for conflicts with always_comb done at vopt time"
+# because vsim will run vopt
+
+# default to config/rv32icfd, but allow this to be overridden at the command line.  For example: do wally-pipelined-batch-rv32icfd.do ../config/rv32icfd rv32icfd
+switch $argc {
+    0 {vlog +incdir+../config/rv32icfd +incdir+../config/shared ../testbench/testbench-imperas.sv ../src/*/*.sv -suppress 2583}
+    1 {vlog +incdir+$1 +incdir+../config/shared ../testbench/testbench-imperas.sv  ../src/*/*.sv -suppress 2583}
+    2 {vlog -work work_$2 +incdir+$1 +incdir+../config/shared ../testbench/testbench-imperas.sv  ../src/*/*.sv -suppress 2583}
+}
+# start and run simulation
+# remove +acc flag for faster sim during regressions if there is no need to access internal signals
+vopt work_$2.testbench -work work_$2 -o workopt_$2
+vsim -lib work_$2 workopt_$2
+
+run -all
+quit
--- a/wally-pipelined/regression/wally-pipelined-rv32icfd.do
+++ b/wally-pipelined/regression/wally-pipelined-rv32icfd.do
@ -0,0 +1,50 @@
+# wally-pipelined-rv32icfd.do
+#
+# Modification by Oklahoma State University & Harvey Mudd College
+# Use with Testbench 
+# James Stine, 2008; David Harris 2021
+# Go Cowboys!!!!!!
+#
+# Takes 1:10 to run RV64IC tests using gui
+
+# Use this wally-pipelined-rv32icfd.do file to run this example.
+# Either bring up ModelSim and type the following at the "ModelSim>" prompt:
+#     do wally-pipelined-rv32icfd.do
+# or, to run from a shell, type the following at the shell prompt:
+#     vsim -do wally-pipelined-rv32icfd.do -c
+# (omit the "-c" to see the GUI while running from the shell)
+
+onbreak {resume}
+
+# create library
+if [file exists work] {
+    vdel -all
+}
+vlib work
+
+# compile source files
+# suppress spurious warnings about 
+# "Extra checking for conflicts with always_comb done at vopt time"
+# because vsim will run vopt
+
+# default to config/rv32icfd, but allow this to be overridden at the command line.  For example:
+# do wally-pipelined-rv32icfd.do ../config/rv32ic
+switch $argc {
+    0 {vlog +incdir+../config/rv32icfd +incdir+../config/shared ../testbench/testbench-imperas.sv ../src/*/*.sv -suppress 2583}
+    1 {vlog +incdir+$1  +incdir+../config/shared ../testbench/testbench-imperas.sv ../testbench/function_radix.sv ../src/*/*.sv -suppress 2583}
+}
+# start and run simulation
+# remove +acc flag for faster sim during regressions if there is no need to access internal signals
+vopt +acc work.testbench -o workopt 
+vsim workopt
+
+view wave
+-- display input and output signals as hexadecimal values
+do ./wave-dos/default-waves.do
+
+-- Run the Simulation 
+#run 5000 
+run -all
+#quit
+noview ../testbench/testbench-imperas.sv
+view wave
--- a/wally-pipelined/regression/wave-all.do
+++ b/wally-pipelined/regression/wave-all.do
@ -152,7 +152,7 @@ add wave -noupdate -radix hexadecimal /testbench/dut/hart/PCTargetE
 add wave -noupdate -radix hexadecimal /testbench/dut/hart/CSRReadValW
 add wave -noupdate -radix hexadecimal /testbench/dut/hart/PrivilegedNextPCM
 add wave -noupdate -radix hexadecimal /testbench/dut/hart/MemRWM
-add wave -noupdate -radix hexadecimal /testbench/dut/hart/InstrValidW
+add wave -noupdate -radix hexadecimal /testbench/dut/hart/InstrValidM
 add wave -noupdate -radix hexadecimal /testbench/dut/hart/InstrMisalignedFaultM
 add wave -noupdate -radix hexadecimal /testbench/dut/hart/DataMisalignedM
 add wave -noupdate -radix hexadecimal /testbench/dut/hart/IllegalBaseInstrFaultD
@ -168,7 +168,7 @@ add wave -noupdate -radix hexadecimal /testbench/dut/hart/CSRWritePendingDEM
 add wave -noupdate -radix hexadecimal /testbench/dut/hart/LoadStallD
 add wave -noupdate -radix hexadecimal /testbench/dut/hart/SetFflagsM
 add wave -noupdate -radix hexadecimal /testbench/dut/hart/FRM_REGW
-add wave -noupdate -radix hexadecimal /testbench/dut/hart/FloatRegWriteW
+add wave -noupdate -radix hexadecimal /testbench/dut/hart/FRegWriteM
 add wave -noupdate -radix hexadecimal /testbench/dut/hart/MemRWAlignedM
 add wave -noupdate -radix hexadecimal /testbench/dut/hart/Funct3M
 add wave -noupdate -radix hexadecimal /testbench/dut/hart/MemAdrM
@ -337,7 +337,7 @@ add wave -noupdate -radix hexadecimal /testbench/dut/hart/ieu/Funct3M
 add wave -noupdate -radix hexadecimal /testbench/dut/hart/ieu/ReadDataW
 add wave -noupdate -radix hexadecimal /testbench/dut/hart/ieu/CSRReadValW
 add wave -noupdate -radix hexadecimal /testbench/dut/hart/ieu/PCLinkW
-add wave -noupdate -radix hexadecimal /testbench/dut/hart/ieu/InstrValidW
+add wave -noupdate -radix hexadecimal /testbench/dut/hart/ieu/InstrValidM
 add wave -noupdate -radix hexadecimal /testbench/dut/hart/ieu/StallD
 add wave -noupdate -radix hexadecimal /testbench/dut/hart/ieu/FlushD
 add wave -noupdate -radix hexadecimal /testbench/dut/hart/ieu/FlushE
@ -397,7 +397,7 @@ add wave -noupdate -radix hexadecimal /testbench/dut/hart/ieu/c/RegWriteM
 add wave -noupdate -radix hexadecimal /testbench/dut/hart/ieu/c/FlushW
 add wave -noupdate -radix hexadecimal /testbench/dut/hart/ieu/c/RegWriteW
 add wave -noupdate -radix hexadecimal /testbench/dut/hart/ieu/c/ResultSrcW
-add wave -noupdate -radix hexadecimal /testbench/dut/hart/ieu/c/InstrValidW
+add wave -noupdate -radix hexadecimal /testbench/dut/hart/ieu/c/InstrValidM
 add wave -noupdate -radix hexadecimal /testbench/dut/hart/ieu/c/CSRWritePendingDEM
 add wave -noupdate -radix hexadecimal /testbench/dut/hart/ieu/c/RegWriteD
 add wave -noupdate -radix hexadecimal /testbench/dut/hart/ieu/c/RegWriteE
@ -740,8 +740,8 @@ add wave -noupdate -radix hexadecimal /testbench/dut/hart/priv/CSRReadValW
 add wave -noupdate -radix hexadecimal /testbench/dut/hart/priv/PrivilegedNextPCM
 add wave -noupdate -radix hexadecimal /testbench/dut/hart/priv/RetM
 add wave -noupdate -radix hexadecimal /testbench/dut/hart/priv/TrapM
-add wave -noupdate -radix hexadecimal /testbench/dut/hart/priv/InstrValidW
-add wave -noupdate -radix hexadecimal /testbench/dut/hart/priv/FloatRegWriteW
+add wave -noupdate -radix hexadecimal /testbench/dut/hart/priv/InstrValidM
+add wave -noupdate -radix hexadecimal /testbench/dut/hart/priv/FRegWriteM
 add wave -noupdate -radix hexadecimal /testbench/dut/hart/priv/LoadStallD
 add wave -noupdate -radix hexadecimal /testbench/dut/hart/priv/PrivilegedM
 add wave -noupdate -radix hexadecimal /testbench/dut/hart/priv/InstrMisalignedFaultM
@ -842,8 +842,8 @@ add wave -noupdate -radix hexadecimal /testbench/dut/hart/priv/csr/uretM
 add wave -noupdate -radix hexadecimal /testbench/dut/hart/priv/csr/TimerIntM
 add wave -noupdate -radix hexadecimal /testbench/dut/hart/priv/csr/ExtIntM
 add wave -noupdate -radix hexadecimal /testbench/dut/hart/priv/csr/SwIntM
-add wave -noupdate -radix hexadecimal /testbench/dut/hart/priv/csr/InstrValidW
-add wave -noupdate -radix hexadecimal /testbench/dut/hart/priv/csr/FloatRegWriteW
+add wave -noupdate -radix hexadecimal /testbench/dut/hart/priv/csr/InstrValidM
+add wave -noupdate -radix hexadecimal /testbench/dut/hart/priv/csr/FRegWriteM
 add wave -noupdate -radix hexadecimal /testbench/dut/hart/priv/csr/LoadStallD
 add wave -noupdate -radix hexadecimal /testbench/dut/hart/priv/csr/NextPrivilegeModeM
 add wave -noupdate -radix hexadecimal /testbench/dut/hart/priv/csr/PrivilegeModeW
@ -937,7 +937,7 @@ add wave -noupdate -radix hexadecimal /testbench/dut/hart/priv/csr/genblk1/csrsr
 add wave -noupdate -radix hexadecimal /testbench/dut/hart/priv/csr/genblk1/csrsr/WriteSSTATUSM
 add wave -noupdate -radix hexadecimal /testbench/dut/hart/priv/csr/genblk1/csrsr/WriteUSTATUSM
 add wave -noupdate -radix hexadecimal /testbench/dut/hart/priv/csr/genblk1/csrsr/TrapM
-add wave -noupdate -radix hexadecimal /testbench/dut/hart/priv/csr/genblk1/csrsr/FloatRegWriteW
+add wave -noupdate -radix hexadecimal /testbench/dut/hart/priv/csr/genblk1/csrsr/FRegWriteM
 add wave -noupdate -radix hexadecimal /testbench/dut/hart/priv/csr/genblk1/csrsr/NextPrivilegeModeM
 add wave -noupdate -radix hexadecimal /testbench/dut/hart/priv/csr/genblk1/csrsr/PrivilegeModeW
 add wave -noupdate -radix hexadecimal /testbench/dut/hart/priv/csr/genblk1/csrsr/mretM
@ -972,7 +972,7 @@ add wave -noupdate -radix hexadecimal /testbench/dut/hart/priv/csr/genblk1/csrsr
 add wave -noupdate -radix hexadecimal /testbench/dut/hart/priv/csr/genblk1/csrsr/STATUS_UIE
 add wave -noupdate -radix hexadecimal /testbench/dut/hart/priv/csr/genblk1/counters/clk
 add wave -noupdate -radix hexadecimal /testbench/dut/hart/priv/csr/genblk1/counters/reset
-add wave -noupdate -radix hexadecimal /testbench/dut/hart/priv/csr/genblk1/counters/InstrValidW
+add wave -noupdate -radix hexadecimal /testbench/dut/hart/priv/csr/genblk1/counters/InstrValidM
 add wave -noupdate -radix hexadecimal /testbench/dut/hart/priv/csr/genblk1/counters/LoadStallD
 add wave -noupdate -radix hexadecimal /testbench/dut/hart/priv/csr/genblk1/counters/CSRMWriteM
 add wave -noupdate -radix hexadecimal /testbench/dut/hart/priv/csr/genblk1/counters/CSRAdrM
--- a/wally-pipelined/regression/wave-dos/default-waves.do
+++ b/wally-pipelined/regression/wave-dos/default-waves.do
@ -8,7 +8,7 @@ add wave /testbench/clk
 add wave /testbench/reset
 add wave -divider
 #add wave /testbench/dut/hart/ebu/IReadF
-add wave /testbench/dut/hart/DataStall
+#add wave /testbench/dut/hart/DataStall
 add wave /testbench/dut/hart/ICacheStallF
 add wave /testbench/dut/hart/StallF
 add wave /testbench/dut/hart/StallD
--- a/wally-pipelined/src/fpu/FMA/add.sv
+++ b/wally-pipelined/src/fpu/FMA/add.sv
@ -1,65 +0,0 @@
-////////////////////////////////////////////////////////////////////////////////
-//
-// Block Name:	add.v
-// Author:		David Harris
-// Date:		11/12/1995
-//
-// Block Description:
-//       This block performs the addition of the product and addend.   It also
-//   contains logic necessary to adjust the signs for effective subtracts 
-//   and negative results. 
-////////////////////////////////////////////////////////////////////////////////
-
-////////////////////////////////////////////////////////////////////////////////
-module add(rM, sM, tM, sum,
-		   negsum, invz, selsum1, negsum0, negsum1, killprodM);
-////////////////////////////////////////////////////////////////////////////////
-
-	input logic 		[105:0]		rM;     			// partial product 1
-	input logic 		[105:0]		sM;              // partial product 2
-	input logic 		[163:0]		tM;             	// aligned addend 
-	input logic					invz;       	// invert addend
-	input logic 					selsum1;    	// select +1 mode of compound adder 
-	input logic					killprodM;    	// z >> product
-	input logic					negsum;      	// Negate sum 
-	output logic		[163:0]		sum;         	// sum
-	output logic					negsum0;     	// sum was negative in +0 mode
-	output logic					negsum1;     	// sum was negative in +1 mode 
-
-	// Internal nodes
-
-	wire		[105:0]		r2;				// partial product possibly zeroed out
-	wire		[105:0]		s2;				// partial product possibly zeroed out
-	wire		[164:0]		t2;				// addend after inversion if necessary
-	wire		[164:0] 	sum0;			// sum of compound adder +0 mode
-	wire		[164:0] 	sum1;			// sum of compound adder +1 mode
-	wire		[163:0] 	prodshifted;			// sum of compound adder +1 mode
-	wire		[164:0] 	tmp;			// sum of compound adder +1 mode
-
-	// Invert addend if z'sM sign is diffrent from the product'sM sign
-
-	assign t2 = invz ? ~{1'b0,tM} : {1'b0,tM};
-	
-	// Zero out product if Z >> product or product really should be 	
-
-	assign r2 = killprodM ? 106'b0 : rM;
-	assign s2 = killprodM ? 106'b0 : sM;
-
-	//***replace this with a more structural cpa that synthisises better
-	// Compound adder
-	// Consists of 3:2 CSA followed by long compound CPA
-	//assign prodshifted = killprodM ? 0 : {56'b0, r2+s2, 2'b0};
-	//assign tmp = ({{57{r2[105]}},r2, 2'b0} + {{57{s2[105]}},s2, 2'b0});
-	assign sum0 = t2 + 164'b0 + {57'b0, r2+s2, 2'b0};
-	assign sum1 = t2 + 164'b1 + {57'b0, r2+s2, 2'b0}; // +1 from invert of z above
-	
-	// Check sign bits in +0/1 modes 
-	assign negsum0 = sum0[164];
-	assign negsum1 = sum1[164];
-
-	// Mux proper result (+Oil mode and inversion) using 4:1 mux
- 	//assign sumzero = |sum;
-	assign sum = selsum1 ? (negsum ? -sum1[163:0] : sum1[163:0]) : (negsum ? -sum0[163:0] : sum0[163:0]);
-	
-endmodule
-
--- a/wally-pipelined/src/fpu/FMA/align.sv
+++ b/wally-pipelined/src/fpu/FMA/align.sv
@ -1,88 +0,0 @@
-/////////////////////////////////////////////////////////////////////////////// 
-// Block Name:	align.v
-// Author:		David Harris
-// Date:		11/2/1995
-//
-// Block Description:
-//   This block implements the alignment shifter.   It is responsible for
-//   adjusting the fraction portion of the addend relative to the fraction
-//   produced in the multiplier array.
-//
-/////////////////////////////////////////////////////////////////////////////
-
-/////////////////////////////////////////////////////////////////////////////
-module align(zman, aligncntE, xzeroE, yzeroE, zzeroE, zdenormE, tE, bsE, 
-             killprodE,  sumshiftE, sumshiftzeroE);
-/////////////////////////////////////////////////////////////////////////////
-
-	input logic 		[51:0]		zman;		// Fraction of addend z;
-	input logic 		[12:0]		aligncntE;	// amount to shift
-	input logic				xzeroE;		// Input X = 0
-	input logic                  		yzeroE;          // Input Y = 0 
-	input logic                  		zzeroE;          // Input Z = 0
-	input logic                  		zdenormE;        // Input Z is denormalized
-	output logic    	[163:0]    	tE;              // aligned addend (54 bits left of bpt)
-	output logic          		bsE;           	// sticky bit of addend
-	output logic          		killprodE;    	// Z >> product
-	output logic		[8:0]		sumshiftE;	
-	output logic				sumshiftzeroE;
-
-	// Internal nodes
- 
-	reg       	[215:0]   	shift;				// aligned addend from shifter
-	logic 		[12:0]		tmp;
-	
-
-
-	always_comb 
-		begin
-
-		// Default to clearing sticky bits 
-		bsE = 0;
-
-		// And to using product as primary operand in adder I exponent gen 
-		killprodE = xzeroE | yzeroE;
-		// d = aligncntE
-		// p = 53
-		//***try reducing this hardware to use one shifter
-		if ($signed(aligncntE) <= $signed(-(13'd105))) begin //d<=-2p+1
-			//product ancored case with saturated shift
-			sumshiftE = 163;	// 3p+4	
-			sumshiftzeroE = 0;
-			shift = {1'b1,zman,163'b0} >> sumshiftE;
-			tE = zzeroE ? 0 : {shift[215:52]};
-			bsE = |(shift[51:0]);
-
-		end else if($signed(aligncntE) <= $signed(13'd2))  begin // -2p+1<d<=2
-			// product ancored or cancellation
-			tmp = 13'd57-aligncntE;
-			sumshiftE = tmp[8:0]; // p + 2 - d  
-			sumshiftzeroE = 0;
-			shift = {~zdenormE,zman,163'b0} >> sumshiftE;
-			tE = zzeroE ? 0 : {shift[215:52]};
-			bsE = |(shift[51:0]);
-
-		end else if ($signed(aligncntE)<=$signed(13'd55))  begin // 2 < d <= p+2
-			// addend ancored case
-			// used to be 56 \/ somthing doesn't seem right too many typos
-			tmp = 13'd57-aligncntE;
-			sumshiftE = tmp[8:0]; 
-			sumshiftzeroE = 0;
-			shift = {~zdenormE,zman, 163'b0} >> sumshiftE;
-			tE = zzeroE ? 0 : {shift[215:52]};
-			bsE = |(shift[51:0]);
-
-		end else begin                 	// d >= p+3
-			// addend anchored case with saturated shift
-			sumshiftE = 0;	
-			sumshiftzeroE = 1;		
-			shift = {~zdenormE,zman, 163'b0} >> sumshiftE;
-			tE = zzeroE ? 0 : {shift[215:52]};
-			bsE = |(shift[51:0]);
-			killprodE = 1;
-
-		end 
-	end
-
-endmodule
-
--- a/wally-pipelined/src/fpu/FMA/booth.sv
+++ b/wally-pipelined/src/fpu/FMA/booth.sv
@ -1,53 +0,0 @@
-module booth(
-	input  logic [53:0] xExt,    // multiplicand
-	input  logic [2:0]  choose,  // selector: which multiple of xExt to emit
-	output logic [1:0]  add1,    // amount still to be added to pp to finish a negation (0, 1 or 2)
-	output logic        e,       // set for every encoding whose selector has choose[2] == 1
-	output logic [54:0] pp);     // encoded multiple of xExt
-/////////////////////////////////////////////////////////////////////////////
-// Emits one multiple of xExt out of {0, +1, +2, -1, -2, -0}, picked by choose.
-// Negative multiples are produced as bitwise inversions; add1 carries the
-// value that completes the two's complement: 1 for -1 and -0, and 2 for -2
-// because the inverted operand has been shifted left by one position.
-// There is deliberately no default arm, matching the full 3-bit decode.
-
-	logic [53:0] xInv;           // bitwise inverse of the multiplicand
-
-	assign xInv = ~xExt;
-
-	always_comb
-		case (choose)
-			3'b000: begin                 //  0
-				pp   = 55'b0;
-				e    = 1'b0;
-				add1 = 2'b00;
-			end
-			3'b001, 3'b010: begin         //  1
-				pp   = {1'b0, xExt};
-				e    = 1'b0;
-				add1 = 2'b00;
-			end
-			3'b011: begin                 //  2
-				pp   = {xExt, 1'b0};
-				e    = 1'b0;
-				add1 = 2'b00;
-			end
-			3'b100: begin                 // -2
-				pp   = {xInv, 1'b0};
-				e    = 1'b1;
-				add1 = 2'b10;
-			end
-			3'b101, 3'b110: begin         // -1
-				pp   = {1'b1, xInv};
-				e    = 1'b1;
-				add1 = 2'b01;
-			end
-			3'b111: begin                 // -0
-				pp   = {55{1'b1}};
-				e    = 1'b1;
-				add1 = 2'b01;
-			end
-		endcase
-
-endmodule
--- a/wally-pipelined/src/fpu/FMA/compressors.sv
+++ b/wally-pipelined/src/fpu/FMA/compressors.sv
@ -1,90 +0,0 @@
-module add3comp2 #(parameter BITS = 4) (
-	input  logic [BITS-1:0] a,
-	input  logic [BITS-1:0] b,
-	input  logic [BITS-1:0] c,
-	output logic [BITS-1:0] carry,
-	output logic [BITS-1:0] sum);
-/////////////////////////////////////////////////////////////////////////////
-// BITS-wide 3:2 compressor built from one sng3comp2 cell per bit position.
-// Cell k reads a[k], b[k], c[k] and drives carry[k] and sum[k]; the carry
-// vector is returned at the same index as its source bit (no shift is
-// applied inside this module).
-
-	genvar bitIdx;
-
-	generate
-		for (bitIdx = 0; bitIdx < BITS; bitIdx = bitIdx + 1) begin
-			sng3comp2 add0(.a(a[bitIdx]), .b(b[bitIdx]), .c(c[bitIdx]),
-			               .carry(carry[bitIdx]), .sum(sum[bitIdx]));
-		end
-	endgenerate
-
-endmodule
-
-module add4comp2 #(parameter BITS = 4) (
-	input  logic [BITS-1:0] a,
-	input  logic [BITS-1:0] b,
-	input  logic [BITS-1:0] c,
-	input  logic [BITS-1:0] d,
-	output logic [BITS:0]   carry,
-	output logic [BITS-1:0] sum);
-/////////////////////////////////////////////////////////////////////////////
-// BITS-wide 4:2 compressor built from sng4comp2 cells.  The intermediate
-// carry (cout) of each cell feeds the cin of the next higher cell; the
-// lowest cell gets cin = 0.  The two carries leaving the top cell are merged
-// into carry[BITS-1] (AND) and carry[BITS] (XOR).
-// The indexing cout[BITS-2] means BITS must be at least 2.
-// NOTE(review): AND on the lower index and XOR on the upper index is the
-// reverse of a plain half adder - confirm against how the caller weights
-// the carry vector.
-
-	logic [BITS-1:0] cout;       // intermediate carry chain between cells
-	logic            topCarry;   // carry output of the most significant cell
-	genvar           k;
-
-	// least significant cell
-	sng4comp2 add0(.a(a[0]), .b(b[0]), .c(c[0]), .d(d[0]), .cin(1'b0),
-	               .cout(cout[0]), .carry(carry[0]), .sum(sum[0]));
-
-	// interior cells
-	generate
-		for (k = 1; k < BITS-1; k = k + 1) begin
-			sng4comp2 add1(.a(a[k]), .b(b[k]), .c(c[k]), .d(d[k]), .cin(cout[k-1]),
-			               .cout(cout[k]), .carry(carry[k]), .sum(sum[k]));
-		end
-	endgenerate
-
-	// most significant cell
-	sng4comp2 add2(.a(a[BITS-1]), .b(b[BITS-1]), .c(c[BITS-1]), .d(d[BITS-1]), .cin(cout[BITS-2]),
-	               .cout(cout[BITS-1]), .carry(topCarry), .sum(sum[BITS-1]));
-
-	assign carry[BITS]   = topCarry ^ cout[BITS-1];
-	assign carry[BITS-1] = topCarry & cout[BITS-1];
-
-endmodule
-
-module sng3comp2(
-	input  logic a,
-	input  logic b,
-	input  logic c,
-	output logic carry,
-	output logic sum);
-/////////////////////////////////////////////////////////////////////////////
-// Single-bit 3:2 compressor: sum is the XOR of the three inputs and carry
-// is their majority.  The carry is built as a mux: when a and b differ the
-// carry equals c, otherwise it equals a (which then also equals b).
-
-	logic abDiffer;              // a ^ b, shared by the sum and the carry mux
-
-	assign abDiffer = a ^ b;
-
-	assign carry = abDiffer ? c : a;
-	assign sum   = abDiffer ^ c;
-
-endmodule
-
-module sng4comp2(
-	input  logic a,
-	input  logic b,
-	input  logic c,
-	input  logic d,
-	input  logic cin,
-	output logic cout,
-	output logic carry,
-	output logic sum);
-/////////////////////////////////////////////////////////////////////////////
-// Single-bit 4:2 compressor made of two chained sng3comp2 cells.
-// The first cell compresses a, b, c; its carry leaves as cout and does not
-// depend on cin.  The second cell compresses the first cell's sum with d
-// and cin and drives the carry and sum outputs.
-
-	logic firstSum;              // sum output of the first 3:2 cell
-
-	sng3comp2 add1(.a(a), .b(b), .c(c), .carry(cout), .sum(firstSum));
-	sng3comp2 add2(.a(firstSum), .b(d), .c(cin), .carry(carry), .sum(sum));
-
-endmodule
--- a/wally-pipelined/src/fpu/FMA/expgen.sv
+++ b/wally-pipelined/src/fpu/FMA/expgen.sv
@ -1,140 +0,0 @@
-/////////////////////////////////////////////////////////////////////////////// 
-// Block Name:	expgen.sv
-// Author:		David Harris
-// Date:		11/2/1995
-//
-//   Block Description:
-//   This block implements the exponent path of the FMAC. It performs the
-//   following operations:
-//
-//   1) Compute exponent of multiply (ae).  
-//   2) Compare multiply and add exponents to generate alignment shift count
-//   3) Clamp the incoming normalized exponent (de0) and check it for overflow/underflow
-//   4)  Increment exponent based on postrounding renormalization (expplus1)
-//   5) Select the exponent of special results (NaN, infinity, overflow, underflow)
-/////////////////////////////////////////////////////////////////////////////
-
-/////////////////////////////////////////////////////////////////////////////
-module expgen(xexp, yexp, zexp,
-			   killprod,  sumzero, resultdenorm, normcnt, infinity, 
-			   FmaFlagsM, inf, xzero, yzero,expplus1,
-			   nan, de0, xnan, ynan, znan, xdenorm, ydenorm, zdenorm, proddenorm, specialsel, zexpsel,
-			   aligncnt, wexp,
-			   prodof, sumof, sumuf, denorm0, ae);
-/////////////////////////////////////////////////////////////////////////////
-  
-	input     	[62:52]    	xexp;           	// Exponent of multiplicand x
-	input     	[62:52]  	yexp;         		// Exponent of multiplicand y
-	input     	[62:52]  	zexp;           	// Exponent of addend z
-	input     			killprod;    	// Z >> product (not read in this module)
-	input     			sumzero;     	// sum exactly equals zero 
-	input     			resultdenorm;  // result is denormalized; forces de to 0 below
-	input     	[8:0]  		normcnt;     	// normalization shift count (only referenced in commented-out code)
-	input     			infinity;    	// generate infinity on overflow 
-	input     	[4:0]	FmaFlagsM;     	// Flags; bits [4] invalid, [2] overflow, [1] underflow are read below
-	input     			inf;			// Some input is infinity
-	input     			nan;			// Some input is NaN
-	input     	[12:0]		de0;			// Result exponent before the clamping done below (driven from outside this module)
-	input     			xnan;			// X is NaN
-	input     			ynan;			// Y is NaN
-	input     			znan;			// Z is NaN 
-	input     			xdenorm;		// X is denorm
-	input     			ydenorm;		// Y is denorm
-	input     			zdenorm;		// Z is denorm
-	input     			xzero;		// X is zero
-	input     			yzero;		// Y is zero
-	input				expplus1;	// added to the result exponent when wexp is formed
-	input     			proddenorm;		// product is denorm (not read in this module)
-	input     			specialsel;  	// Select special result
-	input     			zexpsel;  	// Select Z exponent (only referenced in commented-out code)
-	output		[12:0]   	aligncnt;       // shift count for alignment shifter
-	output		[62:52]    	wexp;           	// Exponent of result
-	output				prodof;         // X*Y exponent out of bounds 
-	output				sumof;          // X*Y+Z exponent out of bounds 
-	output				sumuf;         // X*Y+Z exponent underflows 
-	output				denorm0;     	// exponent = 0 for denorm 
-	output		[12:0]		ae;				//exponent of multiply
-
-	//   Internal nodes
-
-
-	wire 	[12:0]			aligncnt0;		// Assigned below but never read
-	wire 	[12:0]			aligncnt1;		// Assigned below but never read
-	wire 	[12:0]			be;				// Unused (its assign is commented out)
-	wire 	[12:0]			de1;			// Unused (its assign is commented out)
-	wire 	[12:0]			de;				// Normalized exponent, forced to 0 for denormal or zero results
-	wire 	[10:0]			infinityres;	// Infinity or max number
-	wire 	[10:0]			nanres;          //	Nan propagated or generated
-	wire 	[10:0]			specialres;  //	Exceptional case result
-
-	//   Compute exponent of multiply
-	// Note that the exponent does not have to be incremented on a postrounding
-	//   normalization of X because the mantissa was already increased.   Report
-	//   if exponent is out of bounds 
-
-	// ae is forced to 0 when either multiplicand is zero; otherwise it is the biased sum xexp + yexp - 1023
-	assign ae = xzero|yzero ? 0 : xexp + yexp -1023;
-
-	assign prodof = (ae > 2046 && ~ae[12]);
-
-	// Compute alignment shift count
-	// Adjust for postrounding normalization of Z.
-	// This should not increase the critical path because the time to
-	// check if a round overflows is shorter than the actual round and
-	// is masked by the bypass mux and two 10 bit adder delays.
-	assign aligncnt0 = - 1 + ~xdenorm + ~ydenorm - ~zdenorm; // NOTE(review): 1-bit operands are widened before ~ here, so this is not the same value as aligncnt1 - confirm; result is unused
-	assign aligncnt1 = - 1 + {12'b0,~xdenorm} + {12'b0,~ydenorm} - {12'b0,~zdenorm}; // correction term only; unused
-	assign aligncnt = zexp -ae - 1 + {12'b0,~xdenorm} + {12'b0,~ydenorm} - {12'b0,~zdenorm};
-	//assign aligncnt = zexp -ae - 1 + ~xdenorm + ~ydenorm - ~zdenorm;
-	//assign aligncnt = zexp - ae;// KEP use all of ae
-
-	// Select exponent (usually from product except in case of huge addend)
-
-	//assign be = zexpsel ? zexp : ae;
-
-	// Adjust exponent based on normalization
-	// A compound adder takes care of the case of post-rounding normalization
-	// requiring an extra increment
-	 
-	//assign de0 = sumzero ? 13'b0 : be + normcnt + 2;
-	// assign de1 = sumzero ? 13'b0 : be + normcnt + 2;
-	 
-	// If the exponent becomes exactly zero (denormalized)
-	// signal such to adjust R bit before rounding
-
-	assign denorm0 = (de0 == 0);
-	
-	// check for exponent out of bounds after add 
-	
-	assign de = resultdenorm | sumzero ? 0 : de0;
-	assign sumof = ~de[12] && de > 2046;
-	assign sumuf = de == 0  && ~sumzero && ~resultdenorm;
-
-	// bypass occurs before rounding or taking early results 
-	
-	//assign wbypass = de0[10:0];
-	
-	// In a non-critical special mux, we combine the early result from other
-	// FPU blocks with the results of exceptional conditions.  Overflow
-	// produces either infinity or the largest finite number, depending on the
-	// rounding mode.  NaNs are propagated or generated.
-
-	assign specialres = FmaFlagsM[4] | nan ? nanres : // invalid
-					FmaFlagsM[2] ? infinityres : 	//overflow
-					inf ? 11'b11111111111 :
-					FmaFlagsM[1] ? 11'b0 : 11'bx; //underflow; x when no special condition applies
-
-	assign infinityres = infinity ? 11'b11111111111 : 11'b11111111110;
-
-	// IEEE 754-2008 section 6.2.3 states:
-	// "If two or more inputs are NaN, then the payload of the resulting NaN should be 
-	// identical to the payload of one of the input NaNs if representable in the destination
-	// format. This standard does not specify which of the input NaNs will provide the payload."
-	assign nanres = xnan ? xexp : (ynan ? yexp : (znan? zexp : 11'b11111111111));
-
-	// A mux selects the early result from other FPU blocks or the 
-	// normalized FMAC result.   Special cases are also detected. 
-	
-	assign wexp = specialsel ? specialres[10:0] : de[10:0] + expplus1; 
-endmodule
-
--- a/wally-pipelined/src/fpu/FMA/expgen1.sv
+++ b/wally-pipelined/src/fpu/FMA/expgen1.sv
@ -1,90 +0,0 @@
-/////////////////////////////////////////////////////////////////////////////// 
-// Block Name:	expgen1.sv
-// Author:		David Harris
-// Date:		11/2/1995
-//
-//   Block Description:
-//   This block implements the first part of the exponent path of the FMAC.
-//   It performs the following operations:
-//
-//   1) Compute exponent of multiply (aeE) and report if it is out of bounds.  
-//   2) Compare multiply and add exponents to generate alignment shift count
-//   Exponent adjustment after normalization and the special-result mux
-//   are not part of this module (only commented-out remnants remain).
-//
-/////////////////////////////////////////////////////////////////////////////
-
-/////////////////////////////////////////////////////////////////////////////
-module expgen1(xexp, yexp, zexp, xzeroE, yzeroE,
-			   xdenormE, ydenormE, zdenormE, 
-			   aligncntE, prodof, aeE);
-/////////////////////////////////////////////////////////////////////////////
-  
-	input logic     	[62:52]    	xexp;           	// Exponent of multiplicand x
-	input logic     	[62:52]  	yexp;         		// Exponent of multiplicand y
-	input logic     	[62:52]  	zexp;           	// Exponent of addend z
-	input logic     			xdenormE;		// X is denorm
-	input logic     			ydenormE;		// Y is denorm
-	input logic     			zdenormE;		// Z is denorm
-	input logic     			xzeroE;		// X is zero
-	input logic     			yzeroE;		// Y is zero
-	output logic		[12:0]   	aligncntE;       // shift count for alignment shifter
-	output logic			prodof;         // X*Y exponent out of bounds 
-	output logic		[12:0]		aeE;				//exponent of multiply
-
-	//   Internal nodes (none of the wires below is driven or read in this module)
-
-
-	wire 	[12:0]			aligncnt0;		// Unused
-	wire 	[12:0]			aligncnt1;		// Unused
-	wire 	[12:0]			be;				// Unused
-	wire 	[12:0]			de1;			// Unused
-	wire 	[12:0]			de;				// Unused
-	wire 	[10:0]			infinityres;	// Unused
-	wire 	[10:0]			nanres;          //	Unused
-	wire 	[10:0]			specialres;  //	Unused
-
-	//   Compute exponent of multiply
-	// Note that the exponent does not have to be incremented on a postrounding
-	//   normalization of X because the mantissa was already increased.   Report
-	//   if exponent is out of bounds 
-
-	// aeE is forced to 0 when either multiplicand is zero; otherwise it is the biased sum xexp + yexp - 1023 in 13 bits
-	assign aeE = xzeroE|yzeroE ? 0 : {2'b0,xexp} + {2'b0,yexp} - 13'd1023;
-
-	assign prodof = (aeE > 2046 && ~aeE[12]);
-
-	// Compute alignment shift count
-	// Adjust for postrounding normalization of Z.
-	// This should not increase the critical path because the time to
-	// check if a round overflows is shorter than the actual round and
-	// is masked by the bypass mux and two 10 bit adder delays.
-	// assign aligncnt0 = - 1 + ~xdenormE + ~ydenormE - ~zdenormE;
-	// assign aligncnt1 = - 1 + {12'b0,~xdenormE} + {12'b0,~ydenormE} - {12'b0,~zdenormE};
-	assign aligncntE = {2'b0,zexp} -aeE - 1 + {12'b0,~xdenormE} + {12'b0,~ydenormE} - {12'b0,~zdenormE};
-	//assign aligncntE = zexp -aeE - 1 + ~xdenormE + ~ydenormE - ~zdenormE;
-	//assign aligncntE = zexp - aeE;// KEP use all of aeE
-
-	// Select exponent (usually from product except in case of huge addend)
-
-	//assign be = zexpsel ? zexp : aeE;
-
-	// Adjust exponent based on normalization
-	// A compound adder takes care of the case of post-rounding normalization
-	// requiring an extra increment
-	 
-	//assign de0 = sumzero ? 13'b0 : be + normcnt + 2;
-	// assign de1 = sumzero ? 13'b0 : be + normcnt + 2;
-	 
-
-	// bypass occurs before rounding or taking early results 
-	
-	//assign wbypass = de0[10:0];
-	
-	// The special-result mux described in the original exponent block
-	// (early results combined with exceptional conditions, overflow to
-	// infinity or largest finite number, NaN propagation) is not
-	// implemented in this module; no logic follows this comment.
-endmodule
-
-
--- a/wally-pipelined/src/fpu/FMA/expgen2.sv
+++ b/wally-pipelined/src/fpu/FMA/expgen2.sv
@ -1,108 +0,0 @@
-/////////////////////////////////////////////////////////////////////////////// 
-// Block Name:	expgen2.sv
-// Author:		David Harris
-// Date:		11/2/1995
-//
-//   Block Description:
-//   This block implements the second part of the exponent path of the FMAC.
-//   It performs the following operations:
-//
-//   1) Clamp the incoming normalized exponent (de0) to 0 for denormal or zero results  
-//   2) Check the clamped exponent for overflow (sumof) and underflow (sumuf)
-//   3) Select the exponent of special results (NaN, infinity, overflow, underflow)
-//   4)  Increment exponent based on postrounding renormalization (expplus1)
-//
-/////////////////////////////////////////////////////////////////////////////
-
-/////////////////////////////////////////////////////////////////////////////
-module expgen2(xexp, yexp, zexp,
-			   sumzero, resultdenorm, infinity, 
-			   FmaFlagsM, inf, expplus1,
-			   nanM, de0, xnanM, ynanM, znanM,  specialsel,
-			    wexp,
-			   sumof, sumuf);
-/////////////////////////////////////////////////////////////////////////////
-  
-	input logic     	[62:52]    	xexp;           	// Exponent of multiplicand x
-	input logic     	[62:52]  	yexp;         		// Exponent of multiplicand y
-	input logic     	[62:52]  	zexp;           	// Exponent of addend z
-	input logic     			sumzero;     	// sum exactly equals zero 
-	input logic     			resultdenorm;  // result is denormalized; forces de to 0 below
-	input logic     			infinity;    	// generate infinity on overflow 
-	input logic     	[4:0]	FmaFlagsM;     	// Flags; bits [4] invalid, [2] overflow, [1] underflow are read below
-	input logic     			inf;			// Some input is infinity
-	input logic     			nanM;			// Some input is NaN
-	input logic     	[12:0]		de0;			// Result exponent before the clamping done below (driven from outside this module)
-	input logic     			xnanM;			// X is NaN
-	input logic    			ynanM;			// Y is NaN
-	input logic     			znanM;			// Z is NaN 
-	input logic				expplus1;	// added to the result exponent when wexp is formed
-	input logic     			specialsel;  	// Select special result
-	output logic		[62:52]    	wexp;           	// Exponent of result
-	output logic				sumof;          // X*Y+Z exponent out of bounds 
-	output logic				sumuf;         // X*Y+Z exponent underflows 
-
-	//   Internal nodes
-
-
-	wire 	[12:0]			aligncnt0;		// Unused in this module
-	wire 	[12:0]			aligncnt1;		// Unused in this module
-	wire 	[12:0]			be;				// Unused in this module
-	wire 	[12:0]			de1;			// Unused in this module
-	wire 	[12:0]			de;				// Normalized exponent, forced to 0 for denormal or zero results
-	wire 	[10:0]			infinityres;	// Infinity or max number
-	wire 	[10:0]			nanres;          //	Nan propagated or generated
-	wire 	[10:0]			specialres;  //	Exceptional case result
-
-	//   The exponent of the multiply is not computed in this module.
-	// Note that the exponent does not have to be incremented on a postrounding
-	//   normalization of X because the mantissa was already increased.   
-	//   The comments below up to "check for exponent out of bounds" refer to commented-out code only. 
-
-	// Select exponent (usually from product except in case of huge addend)
-
-	//assign be = zexpsel ? zexp : ae;
-
-	// Adjust exponent based on normalization
-	// A compound adder takes care of the case of post-rounding normalization
-	// requiring an extra increment
-	 
-	//assign de0 = sumzero ? 13'b0 : be + normcnt + 2;
-	// assign de1 = sumzero ? 13'b0 : be + normcnt + 2;
-	 
-	
-	// check for exponent out of bounds after add 
-	
-	assign de = resultdenorm | sumzero ? 0 : de0;
-	assign sumof = ~de[12] && de > 2046;
-	assign sumuf = de == 0  && ~sumzero && ~resultdenorm;
-
-	// bypass occurs before rounding or taking early results 
-	
-	//assign wbypass = de0[10:0];
-	
-	// In a non-critical special mux, we combine the early result from other
-	// FPU blocks with the results of exceptional conditions.  Overflow
-	// produces either infinity or the largest finite number, depending on the
-	// rounding mode.  NaNs are propagated or generated.
-
-	assign specialres = FmaFlagsM[4] | nanM ? nanres : // invalid
-					FmaFlagsM[2] ? infinityres : 	//overflow
-					inf ? 11'b11111111111 :
-					FmaFlagsM[1] ? 11'b0 : 11'bx; //underflow; x when no special condition applies
-
-	assign infinityres = infinity ? 11'b11111111111 : 11'b11111111110;
-
-	// IEEE 754-2008 section 6.2.3 states:
-	// "If two or more inputs are NaN, then the payload of the resulting NaN should be 
-	// identical to the payload of one of the input NaNs if representable in the destination
-	// format. This standard does not specify which of the input NaNs will provide the payload."
-	assign nanres = xnanM ? xexp : (ynanM ? yexp : (znanM? zexp : 11'b11111111111));
-
-	// A mux selects the early result from other FPU blocks or the 
-	// normalized FMAC result.   Special cases are also detected. 
-	
-	assign wexp = specialsel ? specialres[10:0] : de[10:0] + {10'b0,expplus1}; 
-endmodule
-
-
--- a/wally-pipelined/src/fpu/FMA/flag.sv
+++ b/wally-pipelined/src/fpu/FMA/flag.sv
@ -1,88 +0,0 @@
-/////////////////////////////////////////////////////////////////////////////// 
-// Block Name:	flag.v
-// Author:		David Harris
-// Date:		12/6/1995
-//
-// Block Description:
-//       This block generates the flags: invalid, overflow, underflow, inexact. 
-/////////////////////////////////////////////////////////////////////////////
-
-/////////////////////////////////////////////////////////////////////////////
-module flag(xnan, ynan, znan, xinf, yinf, zinf, prodof, sumof, sumuf,
-			 psign,  zsign, xzero, yzero, zzero, vbits, killprod,
-			 inf, nan, FmaFlagsM,sticky);
-/////////////////////////////////////////////////////////////////////////////
-// Derives the FMA status flags and the inf / nan summary outputs.
-//   FmaFlagsM[4] invalid, [3] divide by zero (tied to 0), [2] overflow,
-//   [1] underflow, [0] inexact.
-//
-// The overflow and inexact flags are qualified with "some INPUT is infinite"
-// (ininf), not with the inf output: inf also contains suminf, so gating
-// suminf with ~inf cancels itself and the overflow flag could never be set.
-
-	input                  		xnan;        	// X is NaN 
-	input                  		ynan;        	// Y is NaN 
-	input                 		znan;       	// Z is NaN 
-	input                  		sticky;        	// sticky bit, contributes to inexact
-	input                  		xinf;        	// X is Inf
-	input                 		yinf;       	// Y is Inf 
-	input                  		zinf;        	// Z is Inf
-	input                  		prodof;         // X*Y overflows exponent
-	input                  		sumof;          // X*Y + z overflows exponent
-	input                  		sumuf;          // X*Y + z underflows exponent
-	input				psign; 		// Sign of product
-	input				zsign; 		// Sign of z
-	input				xzero;		// x = 0
-	input				yzero;		// y = 0
-	input				zzero;		// z = 0
-	input				killprod;	// product is discarded (Z >> product)
-	input     	[1:0]  		vbits;		// R and S bits of result
-	output				inf;		// Some	source is Inf, or the sum overflowed
-	output				nan;		// Some	source is NaN
-	output		[4:0]	FmaFlagsM;	// {invalid, divide by zero, overflow, underflow, inexact}
- 
-	//   Internal nodes
-
-	wire				prodinf;	// X*Y larger than max possible
-	wire				suminf;		// X*Y+Z larger than max possible
-	wire				ininf;		// at least one input operand is Inf
-
-	// If any input is NaN, propagate the NaN 
-
-	assign nan = xnan || ynan || znan;
-
-	// Same with infinity (inf - inf and 0 * inf don't propagate inf
-	//  but it's ok because illegal op takes higher precedence)
-
-	assign ininf = xinf || yinf || zinf;
-	assign inf= ininf || suminf;//KEP added suminf 
-
-	// Generate infinity checks
-
-	assign prodinf = prodof && ~xnan && ~ynan;
-	// the sum is treated as infinite when its exponent overflows and no input is NaN
-	assign suminf = sumof && ~xnan && ~ynan && ~znan;
-
-	// Set invalid flag for following cases:
-	//   1) Inf - Inf
-	//   2) 0 * Inf
-
-	assign FmaFlagsM[4] = (xinf || yinf || prodinf) && zinf && (psign ^ zsign) ||
-					   xzero && yinf || yzero && xinf;
-
-	assign FmaFlagsM[3] = 0; // divide by zero flag
-
-
-	// Set the overflow flag when the sum overflowed although every input
-	// was finite (an infinite input gives an exact infinite result instead)
-
-	assign FmaFlagsM[2] = suminf && ~ininf;
-
-	// Set the underflow  flag for the following cases:
-	//   1) The sum exponent underflowed and the result is not Inf or NaN
-	//   2) The product was discarded and Z is zero although X and Y are nonzero
-
-	assign FmaFlagsM[1] = (sumuf && ~inf && ~prodinf && ~nan) || (killprod & zzero & ~(yzero | xzero));
-
-	// Set the inexact flag for the following cases:
-	//   1) The R or S bit or the sticky bit is set
-	//   2) The sum overflowed (overflow always implies inexact)
-	// It is suppressed when an input is Inf or NaN
-
-	assign FmaFlagsM[0] = (vbits[0] || vbits[1] ||sticky  || suminf) && ~(ininf || nan);
-
-endmodule
--- a/wally-pipelined/src/fpu/FMA/flag1.sv
+++ b/wally-pipelined/src/fpu/FMA/flag1.sv
@ -1,34 +0,0 @@
-/////////////////////////////////////////////////////////////////////////////// 
-// Block Name:	flag1.sv
-// Author:		David Harris
-// Date:		12/6/1995
-//
-// Block Description:
-//       Early part of the flag logic.  It only reduces the per-operand NaN
-//       indications to a single nanE signal and qualifies the product
-//       exponent overflow with "neither multiplicand is NaN" (prodinfE).
-//       No status flag is produced by this module.
-/////////////////////////////////////////////////////////////////////////////
-
-/////////////////////////////////////////////////////////////////////////////
-module flag1(
-	input  logic xnanE,      // X is NaN
-	input  logic ynanE,      // Y is NaN
-	input  logic znanE,      // Z is NaN
-	input  logic prodof,     // X*Y overflows exponent
-	output logic prodinfE,   // X*Y larger than max possible
-	output logic nanE);      // some source is NaN
-/////////////////////////////////////////////////////////////////////////////
-
-	// any NaN operand marks the whole operation as NaN
-	assign nanE = |{xnanE, ynanE, znanE};
-
-	// the product overflow only counts when neither multiplicand is NaN
-	assign prodinfE = prodof & ~(xnanE | ynanE);
-
-endmodule
--- a/wally-pipelined/src/fpu/FMA/flag2.sv
+++ b/wally-pipelined/src/fpu/FMA/flag2.sv
@ -1,80 +0,0 @@
-/////////////////////////////////////////////////////////////////////////////// 
-// Block Name:	flag.v
-// Author:		David Harris
-// Date:		12/6/1995
-//
-// Block Description:
-//       This block generates the flags: invalid, overflow, underflow, inexact. 
-/////////////////////////////////////////////////////////////////////////////
-
-/////////////////////////////////////////////////////////////////////////////
-module flag2(xsign,ysign,zsign, xnanM, ynanM, znanM, xinfM, yinfM, zinfM, sumof, sumuf,
-			 xzeroM, yzeroM, zzeroM, vbits, killprodM,
-			 inf, nanM, FmaFlagsM,sticky,prodinfM);
-/////////////////////////////////////////////////////////////////////////////
-// Derives the FMA status flags and the inf summary output.
-//   FmaFlagsM[4] invalid, [3] divide by zero (tied to 0), [2] overflow,
-//   [1] underflow, [0] inexact.
-//
-// The overflow and inexact flags are qualified with "some INPUT is infinite"
-// (ininf), not with the inf output: inf also contains suminf, so gating
-// suminf with ~inf cancels itself and the overflow flag could never be set.
-
-	input logic                  		xnanM;        	// X is NaN 
-	input logic                  		ynanM;        	// Y is NaN 
-	input logic                 		znanM;       	// Z is NaN 
-	input logic				xsign; 		// Sign of x
-	input logic				ysign; 		// Sign of y
-	input logic				zsign; 		// Sign of z
-	input logic                  		sticky;        	// sticky bit, contributes to inexact
-    input logic                       prodinfM;       // X*Y larger than max possible
-	input logic                  		xinfM;        	// X is Inf
-	input logic                 		yinfM;       	// Y is Inf 
-	input logic                  		zinfM;        	// Z is Inf
-	input logic                  		sumof;          // X*Y + z overflows exponent
-	input logic                  		sumuf;          // X*Y + z underflows exponent
-	input logic				xzeroM;		// x = 0
-	input logic				yzeroM;		// y = 0
-	input logic				zzeroM;		// z = 0
-	input logic				killprodM;	// product is discarded (Z >> product)
-	input logic     	[1:0]  		vbits;		// R and S bits of result
-	output logic				inf;		// Some	source is Inf, or the sum overflowed
-	input logic				nanM;		// Some	source is NaN
-	output logic		[4:0]	FmaFlagsM;	// {invalid, divide by zero, overflow, underflow, inexact}
- 
-	//   Internal nodes
-
-	logic suminf;	// X*Y+Z larger than max possible
-	logic ininf;	// at least one input operand is Inf
-
-	// Same with infinity (inf - inf and 0 * inf don't propagate inf
-	//  but it's ok because illegal op takes higher precedence)
-
-	assign ininf = xinfM || yinfM || zinfM;
-	assign inf= ininf || suminf;//KEP added suminf 
-
-	// the sum is treated as infinite when its exponent overflows and no input is NaN
-	assign suminf = sumof && ~xnanM && ~ynanM && ~znanM;
-
-
-	// Set the overflow flag when the sum overflowed although every input
-	// was finite (an infinite input gives an exact infinite result instead)
-
-	assign FmaFlagsM[2] = suminf && ~ininf;
-
-	// Set the underflow  flag for the following cases:
-	//   1) The sum exponent underflowed and the result is not Inf or NaN
-	//   2) The product was discarded and Z is zero although X and Y are nonzero
-
-	assign FmaFlagsM[1] = (sumuf && ~inf && ~prodinfM && ~nanM) || (killprodM & zzeroM & ~(yzeroM | xzeroM));
-
-	// Set the inexact flag for the following cases:
-	//   1) The R or S bit or the sticky bit is set
-	//   2) The sum overflowed (overflow always implies inexact)
-	// It is suppressed when an input is Inf or NaN
-
-	assign FmaFlagsM[0] = (vbits[0] || vbits[1] ||sticky  || suminf) && ~(ininf || nanM);
-
-	// Set invalid flag for following cases:
-	//   1) Inf - Inf (product sign xsign ^ ysign differs from zsign)
-	//   2) 0 * Inf
-
-	assign FmaFlagsM[4] = (xinfM || yinfM || prodinfM) && zinfM && (xsign ^ ysign ^ zsign) ||
-					   xzeroM && yinfM || yzeroM && xinfM;
-
-	assign FmaFlagsM[3] = 0; // divide by zero flag
-
-endmodule
--- a/wally-pipelined/src/fpu/FMA/fma.sv
+++ b/wally-pipelined/src/fpu/FMA/fma.sv
@ -1,132 +0,0 @@
- ////////////////////////////////////////////////////////////////////////////////
-// Block Name:	fma.sv
-// Author:		David Harris
-// Date:		11/2/1995
-//
-// Block Description:
-//   This is the top level block of a floating-point  multiply/accumulate
-//   unit(FMAC).   It instantiates the following sub-blocks:
-//
-//    multiply  Mantissa product, delivered as two partial sums (r, s)
-//    align     Alignment shifter
-//    add       Carry-save adder for accumulate, carry propagate adder
-//    lza       Leading zero anticipator to control normalization shifter
-//    normalize Normalization shifter
-//    round     Rounding of result
-//    expgen    Exponent summation, compare, and adjust
-//    special   Catch special cases (inputs = 0  / infinity /  etc.) 
-//    sign      One bit sign handling block 
-//    flag      Generates the invalid, overflow, underflow and inexact flags
-//
-//
-//   The FMAC computes FmaResultM=ReadData1E*ReadData2E+ReadData3E, rounded with the mode specified by
-//   FrmE.  Besides the result, the module outputs the five status flags
-//   FmaFlagsM and the alignment shift count aligncnt.  The port list of
-//   this module has no bypass or trap outputs.  Sub-block ports that are
-//   not connected by name are bound through .* to the identically named
-//   nets declared below, so those net names must not be changed.
-//
-/////////////////////////////////////////////////////////////////////////////
-
-/////////////////////////////////////////////////////////////////////////////
-module fma(ReadData1E, ReadData2E, ReadData3E, FrmE,
-			FmaResultM, FmaFlagsM, aligncnt);
-/////////////////////////////////////////////////////////////////////////////
- 
-	input 		[63:0]		ReadData1E;		// input 1
-	input		[63:0]		ReadData2E;     // input 2 
-	input 		[63:0]		ReadData3E;     // input 3
-	input 		[2:0]	 	FrmE;          	// Rounding mode
-	output 		[63:0]		FmaResultM;     // output FmaResultM=ReadData1E*ReadData2E+ReadData3E
-	output 		[4:0]		FmaFlagsM;    	// status flags
-	output 		[12:0]		aligncnt;    	// shift count for alignment (exposed as a module output)
-
-// Internal nodes
- 
-	logic 		[105:0]		r; 				// one result of partial product sum
-	logic 		[105:0]		s; 				// other result of partial products
-	logic 		[163:0]		t;				// output of alignment shifter
-	logic 		[163:0]		sum;			// output of carry prop adder
-	logic 		[53:0]		v; 				// normalized sum, R, S bits
-//	logic 		[12:0]		aligncnt; 		// shift count for alignment (now declared as an output above)
-	logic 		[8:0]		normcnt; 		// shift count for normalizer
-	logic 		[12:0]		ae; 		// exponent of multiply
-	logic 					bs;				// sticky bit of addend
-	logic 					ps;				// sticky bit of product
-	logic 					killprod; 		// ReadData3E >> product
-	logic 					negsum; 		// negate sum
-	logic 					invz; 			// invert addend
-	logic 					selsum1; 		// select +1 mode of sum
-	logic 					negsum0; 		// sum +0 < 0
-	logic 					negsum1; 		// sum +1 < 0
-	logic 					sumzero; 		// sum = 0
-	logic 					infinity; 		// generate infinity on overflow
-	logic 					prodof; 		// ReadData1E*ReadData2E out of range
-	logic 					sumof;			// result out of range
-	logic					xzero;
-	logic					yzero;
-	logic					zzero;
-	logic					xdenorm;
-	logic					ydenorm;
-	logic					zdenorm;
-	logic					proddenorm;
-	logic					zexpsel;
-	logic					denorm0;
-	logic					resultdenorm;
-	logic					inf;
-	logic					xinf;
-	logic					yinf;
-	logic					zinf;
-	logic					xnan;
-	logic					ynan;
-	logic					znan;
-	logic					specialsel;
-	logic					expplus1;
-	logic					nan;
-	logic					sumuf;
-	logic					psign;
-	logic					sticky;
-	logic			[8:0]		sumshift;
-	logic					sumshiftzero;
-	logic			[12:0]		de0;
-	logic					isAdd;
-
-	assign isAdd = 1; // constant 1; presumably picked up by a sub-block through .* - TODO confirm
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-//   Instantiate fraction datapath (operand fields: [51:0] fraction, [62:52] exponent, [63] sign)
-
-	multiply		multiply(.xman(ReadData1E[51:0]), .yman(ReadData2E[51:0]), .*);
-	align			align(.zman(ReadData3E[51:0]),.*);
-	add				add(.*);
-	lza				lza(.*);
-	normalize		normalize(.zexp(ReadData3E[62:52]),.*); 
-	round			round(.xman(ReadData1E[51:0]), .yman(ReadData2E[51:0]),.zman(ReadData3E[51:0]), .wman(FmaResultM[51:0]),.wsign(FmaResultM[63]),.*);
-
-// Instantiate exponent datapath
-
-	expgen			expgen(.xexp(ReadData1E[62:52]),.yexp(ReadData2E[62:52]),.zexp(ReadData3E[62:52]),.wexp(FmaResultM[62:52]),.*);
-// Instantiate special case detection across datapath & exponent path 
-
-	special			special(.*);
-
-
-// Instantiate control logic (FmaResultM[63] is also wired to wsign of round above)
- 
-sign				sign(.xsign(ReadData1E[63]),.ysign(ReadData2E[63]),.zsign(ReadData3E[63]),.wsign(FmaResultM[63]),.*); 
-flag				flag(.zsign(ReadData3E[63]),.vbits(v[1:0]),.*); 
-
-endmodule
-
--- a/wally-pipelined/src/fpu/FMA/fma1.sv
+++ b/wally-pipelined/src/fpu/FMA/fma1.sv
@ -1,165 +0,0 @@
-module fma1(
-	// Unpacks X, Y and Z, flags zero/Inf/NaN inputs, computes the product mantissa and exponent, and aligns Z to the product.
-	input logic 	[63:0]		FInput1E,	// X
-	input logic		[63:0]		FInput2E,	// Y
-	input logic 	[63:0]		FInput3E,	// Z
-	input logic 	[2:0]		FOpCtrlE,	// 000 = fmadd (X*Y)+Z,  001 = fmsub (X*Y)-Z,  010 = fnmsub -(X*Y)+Z,  011 = fnmadd -(X*Y)-Z,  100 = fmul (X*Y)
-	input logic 				FmtE,		// precision 1 = double 0 = single
-	output logic 	[105:0]		ProdManE,	// 1.X frac * 1.Y frac
-	output logic 	[161:0]		AlignedAddendE,	// Z aligned for addition
-	output logic 	[12:0]		ProdExpE,		// X exponent + Y exponent - bias
-	output logic 				AddendStickyE,	// sticky bit that is calculated during alignment
-	output logic 				KillProdE,		// set the product to zero before addition if the product is too small to matter
-	output logic				XZeroE, YZeroE, ZZeroE, // inputs are zero
-	output logic				XInfE, YInfE, ZInfE,	// inputs are infinity
-	output logic				XNaNE, YNaNE, ZNaNE);	// inputs are NaN
-
-	logic [51:0] 	XFrac,YFrac,ZFrac;	// input fraction
-	logic [52:0] 	XMan,YMan,ZMan;		// input mantissa (with leading one)
-	logic [12:0] 	XExp,YExp,ZExp;		// input exponents
-	logic 		 	XSgn,YSgn,ZSgn;		// input signs
-	logic [12:0]	AlignCnt;			// how far to shift the addend to align with the product
-	logic [211:0] 	Shift;				// output of the alignment shifter including sticky bit
-	logic			XDenormE, YDenormE, ZDenormE;	// inputs are denormal
-	logic [63:0]	FInput3E2;	// value to add (Z or zero)
-	logic [12:0]	Bias;	// 1023 for double, 127 for single
-	logic 			XExpZero, YExpZero, ZExpZero; 	// input exponent zero
-	logic 			XFracZero, YFracZero, ZFracZero; // input fraction zero
-	logic 			XExpMax, YExpMax, ZExpMax; 	// input exponent all 1s
-
-	// Set addend to zero if FMUL instruction
-  	assign FInput3E2 = FOpCtrlE[2] ? 64'b0 : FInput3E;
-
-	// split inputs into the sign bit, fraction, and exponent and handle single or double precision
-	// 		- single precision is in the top half of the inputs
-	assign XSgn = FInput1E[63];
-	assign YSgn = FInput2E[63];
-	assign ZSgn = FInput3E2[63];
-
-	assign XExp = FmtE ? {2'b0, FInput1E[62:52]} : {5'b0, FInput1E[62:55]};
-	assign YExp = FmtE ? {2'b0, FInput2E[62:52]} : {5'b0, FInput2E[62:55]};
-	assign ZExp = FmtE ? {2'b0, FInput3E2[62:52]} : {5'b0, FInput3E2[62:55]};
-
-	assign XFrac = FmtE ? FInput1E[51:0] : {FInput1E[54:32], 29'b0};
-	assign YFrac = FmtE ? FInput2E[51:0] : {FInput2E[54:32], 29'b0};
-	assign ZFrac = FmtE ? FInput3E2[51:0] : {FInput3E2[54:32], 29'b0};
-	
-	assign XMan = {~XExpZero, XFrac};
-	assign YMan = {~YExpZero, YFrac};
-	assign ZMan = {~ZExpZero, ZFrac};
-
-	assign Bias = FmtE ? 13'h3ff : 13'h7f;
-
-
-
-	// determine if an input is a special value
-	assign XExpZero = ~|XExp;
-	assign YExpZero = ~|YExp;
-	assign ZExpZero = ~|ZExp;
-	
-	assign XFracZero = ~|XFrac;
-	assign YFracZero = ~|YFrac;
-	assign ZFracZero = ~|ZFrac;
-
-	assign XExpMax = FmtE ? &XExp[10:0] : &XExp[7:0];
-	assign YExpMax = FmtE ? &YExp[10:0] : &YExp[7:0];
-	assign ZExpMax = FmtE ? &ZExp[10:0] : &ZExp[7:0];
-	
-	assign XNaNE = XExpMax & ~XFracZero;
-	assign YNaNE = YExpMax & ~YFracZero;
-	assign ZNaNE = ZExpMax & ~ZFracZero;
-
-	assign XDenormE = XExpZero & ~XFracZero; 
-	assign YDenormE = YExpZero & ~YFracZero; 
-	assign ZDenormE = ZExpZero & ~ZFracZero; 
-
-	assign XInfE = XExpMax & XFracZero; 
-	assign YInfE = YExpMax & YFracZero; 
-	assign ZInfE = ZExpMax & ZFracZero; 
-
-	assign XZeroE = XExpZero & XFracZero;
-	assign YZeroE = YExpZero & YFracZero;
-	assign ZZeroE = ZExpZero & ZFracZero;
-
-
-
-
-	// Calculate the product's exponent
-	//		- When multiplying two fp numbers, add the exponents
-	// 		- Subtract the bias (XExp + YExp has two biases, one from each exponent)
-	//		- Denormal numbers have an exponent value of 1, however they are 
-	//		  represented with an exponent of 0. add one if there is a denormal number
-	assign ProdExpE = (XZeroE|YZeroE) ? 13'b0 : 
-				 XExp + YExp - Bias + XDenormE + YDenormE;
-
-	// Calculate the product's mantissa
-	//		- Add the assumed one. If the number is denormalized or zero, it does not have an assumed one.
-	assign ProdManE =  XMan * YMan;
-
-
-
-	// determine the shift count for alignment
-	//		- negative means Z is larger, so shift Z left
-	//		- positive means the product is larger, so shift Z right
-	//		- Denormal numbers have an exponent value of 1, however they are 
-	//		  represented with an exponent of 0. add one to the exponent if it is a denormal number
-	assign AlignCnt = ProdExpE - ZExp - ZDenormE;
-
-	// Alignment shifter
-
-	// Default Addition without shifting
-	// 			| 	55'b0	 |	106'b(product)	| 2'b0 |
-	//						 |1'b0| addend  |
-
-	// the 1'b0 before the addend is because the product's mantissa has two bits before the binary point (xx.xxxxxxxxxx...)
-	
-	always_comb 
-		begin
-			
-		// Set default values
-		AddendStickyE = 0;
-		KillProdE = 0;
-		// NOTE(review): Shift has no default here and is assigned only in the two shifting branches below
-		// If the product is too small to affect the sum, kill the product
-
-		// 			| 	55'b0	 |	106'b(product)	| 2'b0 |
-		//	| addend  |
-		if ($signed(AlignCnt) <= $signed(-13'd56)) begin
-			KillProdE = 1;
-			AlignedAddendE = {107'b0, ZMan,2'b0};
-			AddendStickyE = ~(XZeroE|YZeroE);
-
-		// If the Addend is shifted left (negative AlignCnt)
-
-		// 			| 	55'b0	 |	106'b(product)	| 2'b0 |
-		//					| addend  |
-		end else if($signed(AlignCnt) <= $signed(13'd0))  begin
-			Shift = {55'b0, ZMan, 104'b0} << -AlignCnt;
-			AlignedAddendE = Shift[211:50];
-			AddendStickyE = |(Shift[49:0]);
-
-		// If the Addend is shifted right (positive AlignCnt)
-
-		// 			| 	55'b0	 |	106'b(product)	| 2'b0 |
-		//									| addend  |
-		end else if ($signed(AlignCnt)<=$signed(13'd105))  begin
-			Shift = {55'b0, ZMan, 104'b0} >> AlignCnt;
-			AlignedAddendE = Shift[211:50];
-			AddendStickyE = |(Shift[49:0]);
-
-		// If the addend is too small to affect the addition		
-		//		- The addend has to shift two past the end of the addend to be considered too small
-		//		- The 2 extra bits are needed for rounding
-
-		// 			| 	55'b0	 |	106'b(product)	| 2'b0 |
-		//														| addend  |
-		end else begin
-			AlignedAddendE = 162'b0;
-			AddendStickyE = ~ZZeroE;
-
-
-		end 
-	end
-
-endmodule
-
--- a/wally-pipelined/src/fpu/FMA/fma2.sv
+++ b/wally-pipelined/src/fpu/FMA/fma2.sv
@ -1,282 +0,0 @@
-module fma2(
-	// Adds the aligned addend to the product, normalizes and rounds the sum, then selects the result and flags.
-	input logic 	[63:0]		FInput1M,
-	input logic		[63:0]		FInput2M,
-	input logic 	[63:0]		FInput3M,
-	input logic 	[2:0] 		FrmM,
-	input logic 	[105:0]		ProdManM,
-	input logic 	[161:0]		AlignedAddendM,	
-	input logic 	[12:0]		ProdExpM,
-	input logic 				FmtM,
-	input logic 				AddendStickyM,
-	input logic 				KillProdM,
-	input logic 	[2:0]		FOpCtrlM,
-	input logic					XZeroM, YZeroM, ZZeroM,
-	input logic					XInfM, YInfM, ZInfM,
-	input logic					XNaNM, YNaNM, ZNaNM,
-	output logic	[63:0]		FmaResultM,
-	output logic 	[4:0]		FmaFlagsM);
-	
-
-
-	logic [51:0] 	XMan, YMan, ZMan, WMan;
-	logic [10:0] 	XExp, YExp, ZExp, WExp;
-	logic 		 	XSgn, YSgn, ZSgn, WSgn, PSgn;
-	logic [105:0]	ProdMan2;
-	logic [162:0]	AlignedAddend2;
- 	logic [161:0]	Sum;
-	logic [162:0]	SumTmp;
-	logic [12:0]	SumExp;
-	logic [12:0]	SumExpMinus1;
-	logic [12:0]	SumExpTmp, SumExpTmpMinus1, WExpTmp;
-	logic [53:0]	NormSum;
-	logic [161:0]	NormSumTmp;
-	logic [8:0]		NormCnt;
-	logic 			NormSumSticky;
-	logic 			SumZero;
-	logic 			NegSum;
-	logic 			InvZ;
-	logic			ResultDenorm;
-	logic			Sticky;
-	logic 			Plus1, Minus1, Plus1Tmp, Minus1Tmp;
-	logic 			Invalid,Underflow,Overflow,Inexact;
-	logic [8:0]		DenormShift;
-	logic 			ProdInf, ProdOf, ProdUf;
-	logic [63:0]	FmaResultTmp;
-	logic 			SubBySmallNum;
-	logic [63:0]	FInput3M2;
-	logic			ZeroSgn, ResultSgn;
-
-	// Set addend to zero if FMUL instruction
-  	assign FInput3M2 = FOpCtrlM[2] ? 64'b0 : FInput3M;
-
-	// split inputs into the sign bit, mantissa, and exponent for readability
-	
-	assign XSgn = FInput1M[63];
-	assign YSgn = FInput2M[63];
-	assign ZSgn = FInput3M2[63]^FOpCtrlM[0]; //Negate Z if subtraction
-
-	assign XExp = FmtM ? FInput1M[62:52] : {3'b0, FInput1M[62:55]};
-	assign YExp = FmtM ? FInput2M[62:52] : {3'b0, FInput2M[62:55]};
-	assign ZExp = FmtM ? FInput3M2[62:52] : {3'b0, FInput3M2[62:55]};
-
-	assign XMan = FmtM ? FInput1M[51:0] : {FInput1M[54:32], 29'b0};
-	assign YMan = FmtM ? FInput2M[51:0] : {FInput2M[54:32], 29'b0};
-	assign ZMan = FmtM ? FInput3M2[51:0] : {FInput3M2[54:32], 29'b0};
-
-
-
-	// Calculate the product's sign
-	//		Negate product's sign if FNMADD or FNMSUB
-	assign PSgn = XSgn ^ YSgn ^ FOpCtrlM[1];
-
-
-
-
-	// Addition
-	
-	// Negate Z  when doing one of the following operations:
-	//		-prod +  Z
-	//		 prod -  Z 
-	assign InvZ = ZSgn ^ PSgn;
-
-	// Choose an inverted or non-inverted addend - the one is added later
-	assign AlignedAddend2 = InvZ ? ~{1'b0,AlignedAddendM} : {1'b0,AlignedAddendM};
-	// Kill the product if the product is too small to affect the addition (determined in fma1.sv)
-	assign ProdMan2 = KillProdM ? 106'b0 : ProdManM;
-
-	// Do the addition
-	// 		- add one to negate if the addend was inverted
-	//		- the 2 extra bits at the beginning and end are needed for rounding
-	assign SumTmp = AlignedAddend2 + {55'b0, ProdMan2,2'b0} + {162'b0, InvZ};
-	 
-	// Is the sum negative
-	assign NegSum = SumTmp[162];
-	// If the sum is negative, negate the sum.
-	assign Sum = NegSum ? -SumTmp[161:0] : SumTmp[161:0];
-
-
-
-
-
-
-	// Leading one detector
-	logic [8:0]	i;
-	always_comb begin
-			i = 0;
-			while (~Sum[161-i] && $unsigned(i) <= $unsigned(9'd161)) i = i+1;  // search for leading one 
-			NormCnt = i+1;    // compute shift count
-	end
-
-
-
-
-
-
-
-
-
-
-
-	// Normalization
-
-
-	// Determine if the sum is zero
-	assign SumZero = ~(|Sum);
-
-	logic [12:0] ManLen;
-	assign ManLen = FmtM ? 13'd52 : 13'd23;
-	// Determine if the result is denormal
-	assign ResultDenorm = $signed(SumExpTmp)<=0 & ($signed(SumExpTmp)>=$signed(-ManLen));
-
-	// Determine the shift needed for denormal results
-	assign SumExpTmpMinus1 = SumExpTmp-1;
-	assign DenormShift = ResultDenorm ? SumExpTmpMinus1[8:0] : 9'b0;
-
-	// Normalize the sum
-	assign NormSumTmp = SumZero ? 162'b0 : Sum << NormCnt+DenormShift; 
-	assign NormSum = NormSumTmp[161:108];
-	// Calculate the sticky bit
-	assign NormSumSticky = FmtM ? (|NormSumTmp[107:0]) : (|NormSumTmp[136:0]);
-	assign Sticky = AddendStickyM | NormSumSticky;
-
-	// Determine sum's exponent
-	assign SumExpTmp = KillProdM ? {2'b0, ZExp} : ProdExpM + -({4'b0, NormCnt} - 13'd56);
-	assign SumExp = SumZero ? 13'b0 : 
-				 ResultDenorm ? 13'b0 :
-				 SumExpTmp; 
-
-
-
-
-
-	// Rounding
-
-	// round to nearest even
-	//		{Gaurd, Round, Sticky}
-	//		0xx - do nothing
-	//		100 - tie - Plus1 if NormSum[2] = 1
-	//			- don't add 1 if there was supposed to be a subtraction by a small number that didn't happen
-	//		101/110/111 - Plus1
-
-	// 	round to zero - do nothing
-	//			- subtract 1 if a small number was supposed to be subtracted from the positive result
-
-	// 	round to -infinity - Plus1 if negative
-	//			- don't add 1 if there was supposed to be a subtraction by a small number that didn't happen
-	//			- subtract 1 if a small number was supposed to be subtracted from the positive result
-
-	// 	round to infinity - Plus1 if positive
-
-	//			- don't add 1 if there was supposed to be a subtraction by a small number that didn't happen
-	//			- subtract 1 if a small number was supposed to be subtracted from the negative result
-
-	//  round to nearest max magnitude
-	//		{Gaurd, Round, Sticky}
-	//		0xx - do nothing
-	//		100 - tie - Plus1
-	//			- don't add 1 if there was supposed to be a subtraction by a small number that didn't happen
-	//		101/110/111 - Plus1
-
-	// Determine if the result was supposed to be subtracted by a small number
-	logic Gaurd, Round;
-	assign Gaurd = FmtM ? NormSum[1] : NormSum[30];
-	assign Round = FmtM ? NormSum[0] : NormSum[29];
-	assign SubBySmallNum = AddendStickyM&InvZ&~NormSumSticky;
-
-	always_comb begin
-		// Determine if you add 1
-		case (FrmM)
-			3'b000: Plus1Tmp = Gaurd & (Round | (Sticky&~(~Round&SubBySmallNum)) | (~Round&~Sticky&NormSum[2]));//round to nearest even
-			3'b001: Plus1Tmp = 0;//round to zero
-			3'b010: Plus1Tmp = WSgn & ~(SubBySmallNum);//round down
-			3'b011: Plus1Tmp = ~WSgn & ~(SubBySmallNum);//round up
-			3'b100: Plus1Tmp = (Gaurd & (Round | (Sticky&~(~Round&SubBySmallNum)) | (~Round&~Sticky)));//round to nearest max magnitude
-			default: Plus1Tmp = 1'bx;
-		endcase
-		// Determine if you subtract 1
-		case (FrmM)
-			3'b000: Minus1Tmp = 0;//round to nearest even
-			3'b001: Minus1Tmp = SubBySmallNum;//round to zero
-			3'b010: Minus1Tmp = ~WSgn & SubBySmallNum;//round down
-			3'b011: Minus1Tmp = WSgn & SubBySmallNum;//round up
-			3'b100: Minus1Tmp = 0;//round to nearest max magnitude
-			default: Minus1Tmp = 1'bx;
-		endcase
-	
-	end
-
-	// If an answer is exact don't round
-    assign Plus1 = Sticky | (Gaurd|Round) ? Plus1Tmp : 1'b0;
-    assign Minus1 = Sticky | (Gaurd|Round) ? Minus1Tmp : 1'b0;
-	// Compute rounded result 
-    assign {WExpTmp, WMan} = FmtM ? {SumExp, NormSum[53:2]} - {64'b0, Minus1} + {64'b0, Plus1} : {{SumExp, NormSum[53:31]} - {35'b0, Minus1} + {35'b0, Plus1}, 28'b0};
-    assign WExp = WExpTmp[10:0];
-
-
-
-
-
-
-
-	// Sign calculation
-
-
-	// Determine the sign if the sum is zero
-	//	if product underflows then use psign
-	//	otherwise
-	//		if cancellation then 0 unless round to -inf
-	//		otherwise psign
-	assign ZeroSgn = Underflow & ~ResultDenorm ? PSgn :
-				  (PSgn^ZSgn ? FrmM == 3'b010 : PSgn);
-
-	// is the result negative
-	// 	if p - z is the Sum negative
-	// 	if -p + z is the Sum positive
-	// 	if -p - z then the Sum is negative
-	assign ResultSgn = InvZ&(ZSgn)&NegSum | InvZ&PSgn&~NegSum | ((ZSgn)&PSgn);
-	assign WSgn = SumZero ? ZeroSgn : ResultSgn;
- 
-	// Select the result
-	assign FmaResultM = XNaNM ? (FmtM ? {XSgn, FInput1M[62:52], 1'b1,FInput1M[50:0]} : {XSgn, FInput1M[62:55], 1'b1,FInput1M[53:0]}) : 
-						YNaNM ? (FmtM ? {YSgn, FInput2M[62:52], 1'b1,FInput2M[50:0]} : {YSgn, FInput2M[62:55], 1'b1,FInput2M[53:0]}) : 
-						ZNaNM ? (FmtM ? {ZSgn, FInput3M2[62:52], 1'b1,FInput3M2[50:0]} : {ZSgn, FInput3M2[62:55], 1'b1,FInput3M2[53:0]}) :
-						Invalid ? (FmtM ? {WSgn, 11'h7ff, 1'b1, 51'b0} : {WSgn, 8'h7f8, 1'b1, 54'b0}) : // has to be before inf; NOTE(review): 8'h7f8 does not fit in 8 bits and is truncated to 8'hf8 - confirm the intended single-precision exponent
-						XInfM ? {PSgn, FInput1M[62:0]} :
-						YInfM ? {PSgn, FInput2M[62:0]} :
-						ZInfM ? {ZSgn, FInput3M2[62:0]} :
-						Overflow ? (FmtM ? {WSgn, 11'h7ff, 52'b0} : {WSgn, 8'h7f8, 55'b0}) : // NOTE(review): same 8'h7f8 truncation as in the Invalid case
-						Underflow & ~ResultDenorm ? (FmtM ? {WSgn, 63'b0} - {63'b0, (Minus1&AddendStickyM)} + {63'b0, (Plus1&AddendStickyM)} : {{WSgn, 31'b0} - {31'b0, (Minus1&AddendStickyM)} + {31'b0, (Plus1&AddendStickyM)}, 32'b0}) : //***do you need minus1?
-						KillProdM ? (FmtM ? FInput3M2 - {63'b0, (Minus1&AddendStickyM)} + {63'b0, (Plus1&AddendStickyM)} : {FInput3M2[63:32] - {31'b0, (Minus1&AddendStickyM)} + {31'b0, (Plus1&AddendStickyM)}, 32'b0}) : // has to be after Underflow
-						FmtM ? {WSgn,WExp,WMan} : {WSgn,WExp[6:0],WMan,4'b0};
-logic [63:0] tmp;
-	assign tmp = {WSgn,WExp[6:0],WMan,4'b0}; // NOTE(review): tmp is not read anywhere in this module
-
-	// Set Invalid flag for following cases:
-	//   1) Inf - Inf
-	//   2) 0 * Inf
-	//   3) any input is a signaling NaN
-	logic [12:0] MaxExp;
-	assign MaxExp = FmtM ? 13'd2047 : 13'd255;
-	assign ProdOf = (ProdExpM >= MaxExp && ~ProdExpM[12]);
-	assign ProdInf = ProdOf && ~XNaNM && ~YNaNM;
-	assign SigNaN = FmtM ? (XNaNM&~FInput1M[51]) | (YNaNM&~FInput2M[51]) | (ZNaNM&~FInput3M2[51]) : (XNaNM&~FInput1M[54]) | (YNaNM&~FInput2M[54]) | (ZNaNM&~FInput3M2[54]); // NOTE(review): SigNaN has no declaration in this module
-	assign Invalid = SigNaN | ((XInfM || YInfM || ProdInf) & ZInfM & (XSgn ^ YSgn ^ ZSgn)) | (XZeroM & YInfM) | (YZeroM & XInfM);  
-	
-	// Set Overflow flag if the number is too big to be represented
-	assign Overflow = WExpTmp >= MaxExp & ~WExpTmp[12];
-
-	// Set Underflow flag if the number is too small to be represented in normal numbers
-	assign ProdUf = KillProdM & ZZeroM;
-	assign Underflow = SumExp[12] | ProdUf;
-
-	// Set Inexact flag if the result is different from what would be output given infinite precision
-	assign Inexact = (Sticky|Overflow| (Gaurd|Round))&~(XNaNM|YNaNM|ZNaNM|XInfM|YInfM|ZInfM);
-
-	// Combine flags 
-	//		- FMA can't set the Divide by zero flag
-	//		- Don't set the underflow flag if the result is exact 
-	assign FmaFlagsM = {Invalid, 1'b0, Overflow, Underflow & Inexact, Inexact};
-
-endmodule
-
--- a/wally-pipelined/src/fpu/FMA/lza.sv
+++ b/wally-pipelined/src/fpu/FMA/lza.sv
@ -1,40 +0,0 @@
-/////////////////////////////////////////////////////////////////////////////// 
-// Block Name:	lop.v
-// Author:		David Harris
-// Date:		11/2/1995
-//
-// Block Description:
-//   This block implements a Leading One Predictor used to determine 
-//   the normalization shift count. 
-///////////////////////////////////////////////////////////////////////////////
-
-///////////////////////////////////////////////////////////////////////////// 
-module lza(sum, normcnt, sumzero); 
-/////////////////////////////////////////////////////////////////////////////
-	// Behavioral search for the leading one of sum; also reports whether sum is all zeros.
-	input logic     	[163:0]  	sum;            // sum
-	output logic     	[8:0]		normcnt;		// normalization shift count
-	output logic     		  		sumzero;		// sum = 0
-
-	// Internal nodes
-
-	reg			[8:0] 		i;				// loop index
- 
-	// A real LOP uses a fast carry chain to find only the first 0.
-	// It is an example of a parallel prefix algorithm.  For the sake
-	// of simplicity,  this model is behavioral instead.
-	// A real LOP would also operate on the sources of the adder, not
-	// the result!
-
-	always_comb
-		begin
-			i =   0;
-			while (~sum[163-i] && i <= 163) i = i+1;  // scan down from bit 163 until a one is found
-			normcnt = i;    // shift count = number of zero bits above the leading one
-	end
-
-	// Also check if sum is zero 
-	assign sumzero = ~(|sum);
-	
-endmodule
-
--- a/wally-pipelined/src/fpu/FMA/multiply.sv
+++ b/wally-pipelined/src/fpu/FMA/multiply.sv
@ -1,136 +0,0 @@
-
-module multiply(xman, yman, xdenormE, ydenormE, xzeroE, yzeroE, rE, sE); 
-/////////////////////////////////////////////////////////////////////////////
-	// Builds 27 partial products with the booth submodule and reduces them through add3comp2/add4comp2 stages to the pair rE, sE.
-	input logic 		[51:0]		xman;				// Fraction of multiplicand	x
-	input logic		[51:0]		yman;				// Fraction of multiplicand y	
-	input logic					xdenormE;		// is x denormalized	
-	input logic					ydenormE;		// is y denormalized	
-	input logic     			xzeroE;		// x is zero (with xdenormE, clears the assumed 1 of xExt)
-	input logic     			yzeroE;		// y is zero (with ydenormE, clears the assumed 1 of yExt)
-	output logic		[105:0]		rE;				//	partial product 1	
-	output logic		[105:0]		sE;				//	partial product 2	
-    
-     wire        [54:0]      yExt; //y with leading 0, assumed 1 and appended 0
-     wire        [53:0]      xExt; //x with leading 0 and assumed 1
-     wire [26:0][1:0] add1;
-     wire [26:0][54:0] pp; 
-     wire [26:0] e;
-     logic [106:0] tmpsE;
-     logic [17:0][106:0] lv1add;
-     logic [11:0][106:0] lv2add;
-     logic [7:0][106:0] lv3add;
-     logic [3:0][106:0] lv4add;
-     logic [21:0][107:0] carryTmp;
-     wire [26:0][106:0] acc; 
-     // wire [105:0] acc
-    genvar i;	
-
-	assign xExt = {1'b0,~(xdenormE|xzeroE),xman};
-	assign yExt = {1'b0,~(ydenormE|yzeroE),yman, 1'b0};
-    
-     generate
-        for(i=0; i<27; i=i+1) begin
-            booth booth(.xExt(xExt), .choose(yExt[(i*2)+2:i*2]), .add1(add1[i]), .e(e[i]), .pp(pp[i]));
-        end
-     endgenerate
-
-    assign acc[0] = {49'b0,~e[0],e[0],e[0],pp[0]}; 
-    assign acc[1] = {49'b01,~e[1],pp[1],add1[0]}; 
-    assign acc[2] = {47'b01,~e[2],pp[2],add1[1], 2'b0};
-    assign acc[3] = {45'b01,~e[3],pp[3],add1[2], 4'b0};
-    assign acc[4] = {43'b01,~e[4],pp[4],add1[3], 6'b0};
-    assign acc[5] = {41'b01,~e[5],pp[5],add1[4], 8'b0};
-    assign acc[6] = {39'b01,~e[6],pp[6],add1[5], 10'b0};
-    assign acc[7] = {37'b01,~e[7],pp[7],add1[6], 12'b0};
-    assign acc[8] = {35'b01,~e[8],pp[8],add1[7], 14'b0};
-    assign acc[9] = {33'b01,~e[9],pp[9],add1[8], 16'b0};
-    assign acc[10] = {31'b01,~e[10],pp[10],add1[9], 18'b0};
-    assign acc[11] = {29'b01,~e[11],pp[11],add1[10], 20'b0};
-    assign acc[12] = {27'b01,~e[12],pp[12],add1[11], 22'b0};
-    assign acc[13] = {25'b01,~e[13],pp[13],add1[12], 24'b0};
-    assign acc[14] = {23'b01,~e[14],pp[14],add1[13], 26'b0};
-    assign acc[15] = {21'b01,~e[15],pp[15],add1[14], 28'b0};
-    assign acc[16] = {19'b01,~e[16],pp[16],add1[15], 30'b0};
-    assign acc[17] = {17'b01,~e[17],pp[17],add1[16], 32'b0};
-    assign acc[18] = {15'b01,~e[18],pp[18],add1[17], 34'b0};
-    assign acc[19] = {13'b01,~e[19],pp[19],add1[18], 36'b0};
-    assign acc[20] = {11'b01,~e[20],pp[20],add1[19], 38'b0};
-    assign acc[21] = {9'b01,~e[21],pp[21],add1[20], 40'b0};
-    assign acc[22] = {7'b01,~e[22],pp[22],add1[21], 42'b0};
-    assign acc[23] = {5'b01,~e[23],pp[23],add1[22], 44'b0};
-    assign acc[24] = {3'b01,~e[24],pp[24],add1[23], 46'b0};
-    assign acc[25] = {1'b0, ~e[25],pp[25],add1[24], 48'b0};
-    assign acc[26] = {pp[26],add1[25], 50'b0};
-
-    //*** resize adders
-     generate
-        for(i=0; i<9; i=i+1) begin
-            add3comp2 #(.BITS(107)) add1(.a(acc[i*3]), .b(acc[i*3+1]), .c(acc[i*3+2]), 
-                                           .carry(carryTmp[i][106:0]), .sum(lv1add[i*2+1]));
-            assign lv1add[i*2] = {carryTmp[i][105:0], 1'b0};
-        end
-     endgenerate
-
-     generate
-        for(i=0; i<6; i=i+1) begin
-            add3comp2 #(.BITS(107)) add2(.a(lv1add[i*3]), .b(lv1add[i*3+1]), .c(lv1add[i*3+2]), 
-                                           .carry(carryTmp[i+9][106:0]), .sum(lv2add[i*2+1]));
-            assign lv2add[i*2] = {carryTmp[i+9][105:0], 1'b0};
-        end
-     endgenerate
-
-    generate
-        for(i=0; i<4; i=i+1) begin
-            add3comp2 #(.BITS(107)) add3(.a(lv2add[i*3]), .b(lv2add[i*3+1]), .c(lv2add[i*3+2]), 
-                                            .carry(carryTmp[i+15][106:0]), .sum(lv3add[i*2+1]));
-            assign lv3add[i*2] = {carryTmp[i+15][105:0], 1'b0};
-        end
-    endgenerate
-
-
-    generate
-        for(i=0; i<2; i=i+1) begin
-            add4comp2 #(.BITS(107)) add4(.a(lv3add[i*4]), .b(lv3add[i*4+1]), .c(lv3add[i*4+2]), .d(lv3add[i*4+3]),
-                                            .carry(carryTmp[i+19]), .sum(lv4add[i*2+1]));
-            assign lv4add[i*2] = {carryTmp[i+19][105:0], 1'b0};
-        end
-    endgenerate
-
-    add4comp2 #(.BITS(107)) add5(.a(lv4add[0]), .b(lv4add[1]), .c(lv4add[2]), .d(lv4add[3]) ,
-                                    .carry(carryTmp[21]), .sum(tmpsE));
-    assign sE = tmpsE[105:0];
-    assign rE = {carryTmp[21][104:0], 1'b0};
-		// assign rE = 0;
-		// assign sE = acc[0] +
-		// 		   acc[1] +
-		// 		   acc[2] +
-		// 		   acc[3] +
-		// 		   acc[4] +
-		// 		   acc[5] +
-		// 		   acc[6] +
-		// 		   acc[7] +
-		// 		   acc[8] +
-		// 		   acc[9] +
-		// 		   acc[10] +
-		// 		   acc[11] +
-		// 		   acc[12] +
-		// 		   acc[13] +
-		// 		   acc[14] +
-		// 		   acc[15] +
-		// 		   acc[16] +
-		// 		   acc[17] +
-		// 		   acc[18] +
-		// 		   acc[19] +
-		// 		   acc[20] +
-		// 		   acc[21] +
-		// 		   acc[22] +
-		// 		   acc[23] +
-		// 		   acc[24] +
-		// 		   acc[25] +
-		// 		   acc[26];
-
-			// assign sE = {53'b0,~(xdenormE|xzeroE),xman}  *  {53'b0,~(ydenormE|yzeroE),yman};
-			// assign rE = 0;
-endmodule
-
--- a/wally-pipelined/src/fpu/FMA/normalize.sv
+++ b/wally-pipelined/src/fpu/FMA/normalize.sv
@ -1,147 +0,0 @@
-/////////////////////////////////////////////////////////////////////////////// 
-// Block Name:	normalize.v
-// Author:		David Harris
-// Date:		11/2/1995
-//
-// Block Description:
-//   This block performs the normalization shift.  It also
-//   generates the R and S bits for rounding.  Finally, it
-//   handles the special case of a zero sum.
-//
-//   v[53:2]  is the fraction component of the prerounded result.
-//   It can be bypassed back to the X or Z inputs of the FMAC
-//   for back-to-back operations. 
-/////////////////////////////////////////////////////////////////////////////
-
-/////////////////////////////////////////////////////////////////////////////
-module normalize(sum, zexp, normcnt, aeM, aligncntM, sumshiftM, sumshiftzeroM, sumzero, 
-				xzeroM, zzeroM, yzeroM, bsM, xdenormM, ydenormM, zdenormM, sticky, de0, resultdenorm, v); 
-/////////////////////////////////////////////////////////////////////////////
-	input logic     	[163:0]  	sum;            // sum
-	input logic     	[62:52]  	zexp;            // exponent field of Z
-	input logic		[8:0] 		normcnt;     	// normalization shift count
-	input logic		[12:0] 		aeM;     	// ea + eb (see the always_comb notes below)
-	input logic		[12:0] 		aligncntM;     	// d, the alignment count (see the always_comb notes below)
-	input logic		[8:0] 		sumshiftM;     	// shift amount applied to sum
-	input logic				sumshiftzeroM;
-	input logic				sumzero;	// sum is zero
-	input logic				bsM;		// sticky bit for addend
-	input logic                  		xdenormM;        // Input X is denormalized
-	input logic                  		ydenormM;        // Input Y is denormalized
-	input logic                  		zdenormM;        // Input Z is denormalized
-	input logic				xzeroM;
-	input logic				yzeroM;
-	input logic				zzeroM;
-	output logic				sticky;		//sticky bit
-	output logic		[12:0]		de0;
-	output logic                  	resultdenorm;        // result is denormal (de0 is zero or negative)
-	output logic		[53:0]		v;		// normalized sum, R, S bits
-	// Shifts sum into normalized position and produces v, sticky, de0 and resultdenorm.
-	// Internal nodes
-
-logic       	[163:0]  	sumshifted;     // shifted sum
-	logic		[9:0]		sumshifttmp;
-	logic       	[163:0]  	sumshiftedtmp;     // shifted sum
-	logic				isShiftLeft1;
-logic tmp,tmp1,tmp2,tmp3,tmp4, tmp5; // NOTE(review): tmp* are never read in this module
-
-	// When the sum is zero,  normalization does not apply and only the
-	// sticky bit must be computed.  Otherwise,  the sum is shifted
-	// and the R and S bits (v[1]  and v[0],  respectively) are assigned.
-
-	// The R bit is also set on denormalized numbers where the exponent
-	// was computed to be exactly -1023 and the L bit was set.  This
-	// is required for correct rounding up of multiplication results.
-
-	// The sticky bit calculation is actually built into the shifter and
-	// does not require a true subtraction shown in the model.
- 
-	assign isShiftLeft1 = (aligncntM == 13'b1 ||aligncntM == 13'b0 || $signed(aligncntM) == $signed(-(13'b1)))&& zexp == 11'h2;
-	// assign tmp = ($signed(aeM-normcnt+2) >= $signed(-1022));
-	always_comb
-		begin
-		// d = aligncntM
-		// l = normcnt
-		// p = 53
-		// ea + eb = aeM
-			// set d<=2 to d<=0
-			if ($signed(aligncntM)<=$signed(13'd2))  begin //d<=2 
-				// product anchored or cancellation
-				if ($signed(aeM-{{4{normcnt[8]}},normcnt}+13'd2) >= $signed(-(13'd1022))) begin //ea+eb-l+2 >= emin
-					//normal result
-					de0 = xzeroM|yzeroM ? {2'b0,zexp} : aeM-{{4{normcnt[8]}},normcnt}+{12'b0,xdenormM}+{12'b0,ydenormM}+13'd57;
-					resultdenorm = |sum & ~|de0 | de0[12];
-					// if z is zero then there was a 56 bit shift of the product
-					sumshifted = resultdenorm ? sum << sumshiftM-{8'b0,zzeroM}+{8'b0,isShiftLeft1} : sum << normcnt; // p+2+l
-					v = sumshifted[162:109];
-					sticky = (|sumshifted[108:0]) | bsM;
-					//de0 = aeM-normcnt+2-1023;
-				end else begin
-					sumshifted = sum << (13'd1080+aeM);
-					v = sumshifted[162:109];
-					sticky = (|sumshifted[108:0]) | bsM;
-					resultdenorm = 1;
-					de0 = 0;
-				end
-
-			end else begin                 // extract normalized bits
-				sumshifttmp = {1'b0,sumshiftM} - 2;
-				sumshifted = sumshifttmp[9] ? sum : sum << sumshifttmp;
-				tmp1 = (sumshifted[163] & ~sumshifttmp[9]);
-				tmp2 = ((sumshifttmp[9] & sumshiftM[0]) || sumshifted[162]);
-				tmp3 = (sumshifted[161] || (sumshifttmp[9] & sumshiftM[1]));
-				tmp4 = sumshifted[160];
-				tmp5 = sumshifted[159];
-				// for some reason use exp = zexp + {0,1,2}
-				// the book says exp = zexp + {-1,0,1}
-				if(sumshiftzeroM) begin
-					v = sum[162:109];
-					sticky = (|sum[108:0]) | bsM;
-					de0 = {2'b0,zexp};
-				end else if(sumshifted[163] & ~sumshifttmp[9])begin
-					v = sumshifted[162:109];
-					sticky = (|sumshifted[108:0]) | bsM;
-					de0 = {2'b0,zexp} +13'd2;
-				end else if ((sumshifttmp[9] & sumshiftM[0]) || sumshifted[162]) begin
-					v = sumshifted[161:108];
-					sticky = (|sumshifted[107:0]) | bsM;
-					de0 = {2'b0,zexp}+13'd1;
-				end else if (sumshifted[161] || (sumshifttmp[9] & sumshiftM[1])) begin
-					v = sumshifted[160:107];
-					sticky = (|sumshifted[106:0]) | bsM;
-					//de0 = zexp-1;
-					de0 = {2'b0,zexp}+{12'b0,zdenormM};
-				end else if(sumshifted[160]& ~zdenormM) begin
-					de0 = {2'b0,zexp}-13'b1;
-					v = ~|de0&~sumzero ? sumshifted[160:107] : sumshifted[159:106];
-					sticky = (|sumshifted[105:0]) | bsM;
-					//de0 = zexp-1;
-				end else if(sumshifted[159]& ~zdenormM) begin
-					//v = sumshifted[158:105];
-					de0 = {2'b0,zexp}-13'd2;
-					v = (~|de0 | de0[12])&~sumzero ? sumshifted[161:108] : sumshifted[158:105];
-					sticky = (|sumshifted[104:0]) | bsM;
-					//de0 = zexp-1;
-				end else if(zdenormM) begin					
-					v = sumshifted[160:107];
-					sticky = (|sumshifted[106:0]) | bsM;
-					//de0 = zexp-1;
-					de0 = {{2{zexp[62]}},zexp};
-				end else begin
-					de0 = 0;
-					sumshifted = sum << sumshiftM-1; // p+2+l
-					v = sumshifted[162:109];
-					sticky = (|sumshifted[108:0]) | bsM;
-				end
-
-				resultdenorm = (~|de0 | de0[12]);
-		end 
-	end
-
-
-	// shift sum left by normcnt,  filling the right with zeros 
-	//assign sumshifted = sum << normcnt;
-	
-endmodule
-
-
--- a/wally-pipelined/src/fpu/FMA/round.sv
+++ b/wally-pipelined/src/fpu/FMA/round.sv
@ -1,124 +0,0 @@
-///////////////////////////////////////////////////////////////////////////// 
-// Block Name:	round.v
-// Author:		David Harris
-// Date:		11/2/1995
-//
-// Block Description: 
-//   This block is responsible for rounding the normalized result of the FMAC.   Because prenormalized results may be bypassed back to the FMAC X and Z inputs, rounding does not appear in the critical path of most floating point code.   This is good because rounding requires an entire 52 bit carry-propagate half-adder delay.
-//
-//   The results from other FPU blocks (e.g. FCVT,  FDIV,  etc)  are also 
-//   muxed in to form the actual result for register file writeback.  This
-//   saves a mux from the writeback path.
-//
-/////////////////////////////////////////////////////////////////////////////
-
-/////////////////////////////////////////////////////////////////////////////
-module round(v, sticky, FrmM, wsign,
-			  FmaFlagsM, inf, nanM, xnanM, ynanM, znanM, 
-			  xman, yman, zman,
-			  wman, infinity, specialsel,expplus1);
-/////////////////////////////////////////////////////////////////////////////
-
-	input logic		[53:0]		v;		// normalized sum, R, S bits
-	input logic				sticky;		//sticky bit
-	input logic		[2:0]	FrmM;		// Rounding mode, decoded by the case statement below
-	input logic				wsign;		// Sign of result
-	input logic 		[4:0]	FmaFlagsM;	// Flags; [4]/[2]/[1] are read below (invalid/overflow/underflow per the comments there)
-	input logic				inf;		// Some input is infinity
-	input logic				nanM;		// Some input is NaN
-	input logic				xnanM;		// X is NaN
-	input logic				ynanM;		// Y is NaN
-	input logic				znanM;		// Z is NaN
-	input logic		[51:0]		xman;		// fraction bits of X (low 51 bits become the NaN payload)
-	input logic		[51:0]		yman;		// fraction bits of Y (low 51 bits become the NaN payload)
-	input logic		[51:0]		zman;		// fraction bits of Z (low 51 bits become the NaN payload)
-	output logic		[51:0]		wman; 		// rounded result of FMAC
-	output logic				infinity;    	// Generate infinity on overflow
-	output logic				specialsel;  	// Select special result
-	output logic				expplus1;	// Rounding up carried out of v1[52] and no special result is selected
-
-	// Internal nodes
-
-	logic				plus1;		// Round by adding one 
-	wire		[52:0]		v1;		// Result + 1 (for rounding)
-	wire		[51:0]		specialres;	// Result of exceptional case 
-	wire		[51:0]		infinityres;	// Infinity or largest real number
-	wire		[51:0]		nanres;		// Propagated or generated NaN 
-
-	// Compute if round should occur.  This equation is derived from
-	// the rounding tables.
-
-	// round to infinity - plus1 if positive
-	// round to -infinity - plus1 if negative
-	// round to zero - do nothing
-	// round to nearest even
-	//	{v[1], v[0], sticky}
-	//	0xx - do nothing
-	//	100 - tie - plus1 if v[2] = 1
-	//	101/110/111 - plus1
-	always_comb begin
-		case (FrmM)
-			3'b000: plus1 = (v[1] & (v[0] | sticky | (~v[0]&~sticky&v[2])));//round to nearest even
-			3'b001: plus1 = 0;//round to zero
-			3'b010: plus1 = wsign;//round down; NOTE(review): not gated on v[1], v[0] or sticky - confirm exact results are handled elsewhere
-			3'b011: plus1 = ~wsign;//round up; NOTE(review): not gated on v[1], v[0] or sticky - confirm exact results are handled elsewhere
-			3'b100: plus1 = (v[1] & (v[0] | sticky | (~v[0]&~sticky&~wsign)));//round to nearest max magnitude; NOTE(review): on an exact tie this adds 1 only when wsign is 0 - confirm
-			default: plus1 = 1'bx;
-		endcase
-	end
-	// assign plus1 = (rn & v[1] & (v[0] | sticky | (~v[0]&~sticky&v[2]))) |
-	// 	       (rp & ~wsign) |
-	// 	       (rm & wsign);
-	//assign plus1 = rn && ((v[1] && v[0]) || (v[2] && (v[1]))) ||
-	//				 rp && ~wsign && (v[1] || v[0]) ||
-	//				 rm && wsign && (v[1] || v[0]);
-
-	// Compute rounded result 
-    assign v1 = v[53:2] + 1;
-	// Determine if postnormalization is necessary
-	// Predicted by all bits =1 before round +1
-
-	//assign postnormalize = &(v[53:2]) && plus1;
-
-	// Determine special result in the event of selection of a result from
-	// another FPU functional unit,  infinity, NAN,  or underflow
-	// The special result mux is a 4:1 mux that should not appear in the
-	// critical path of the machine.   It is not priority encoded,  despite
-	// the code below suggesting otherwise.  Also,  several of the identical data
-	// inputs to the wide muxes can be combined at the expense of more
-	// complicated non-critical control in the circuit implementation.
-
-	assign specialsel =  FmaFlagsM[2] ||  FmaFlagsM[1] ||  FmaFlagsM[4] || //overflow underflow invalid
-							nanM || inf;
-	assign specialres = FmaFlagsM[4] | nanM ? nanres : //invalid
-						 FmaFlagsM[2] ? infinityres : //overflow
-						 inf ? 52'b0 :
-						 FmaFlagsM[1] ? 52'b0 : 52'bx;  // underflow
-
-	// Overflow is handled differently for different rounding modes
-	// Round is to either infinity or to maximum finite number
-
-	assign infinity =  |FrmM;//rn || (rp && ~wsign) || (rm && wsign);//***look into this
-	assign infinityres = infinity ? 52'b0 : {52{1'b1}};
-
-	// Invalid operations produce a quiet NaN. The result should
-	// propagate an input if the input is NaN. Since we assume all
-	// NaN inputs are already quiet, we don't have to force them quiet.
-
-	// assign nanres = xnanM ? x: (ynanM ? y : (znanM ? z : {1'b1, 51'b0})); // original
-
-	// IEEE 754-2008 section 6.2.3 states:
-	// "If two or more inputs are NaN, then the payload of the resulting NaN should be 
-	// identical to the payload of one of the input NaNs if representable in the destination
-	// format. This standard does not specify which of the input NaNs will provide the payload."
-	assign nanres = xnanM ? {1'b1, xman[50:0]}: (ynanM ? {1'b1, yman[50:0]} : (znanM ? {1'b1, zman[50:0]} : {1'b1, 51'b0}));// KEP 210112 add the 1 to make NaNs quiet
-
-	// Select result with 4:1 mux
-	// If the sum is zero and we round up,  there is a special case in
-	// which we produce a massive loss of significance and trap to software.
-	// It is handled in the exception unit. 
-	assign expplus1 = v1[52] & ~specialsel & plus1;
-	assign wman = specialsel ? specialres : (plus1 ? v1[51:0] : v[53:2]);
-	
-endmodule
-
--- a/wally-pipelined/src/fpu/FMA/sign.sv
+++ b/wally-pipelined/src/fpu/FMA/sign.sv
@ -1,111 +0,0 @@
-/////////////////////////////////////////////////////////////////////////////// 
-// Block Name:	sign.v
-// Author:		David Harris
-// Date:		12/1/1995
-//
-// Block Description:
-//   This block manages the signs of the numbers.
-//   1 =  negative
-//
-/////////////////////////////////////////////////////////////////////////////
-
-/////////////////////////////////////////////////////////////////////////////
-module sign(xsign, ysign, zsign, negsum0, negsum1, bsM, FrmM, FmaFlagsM, 
-			 sumzero, zinfM, inf, wsign, invz, negsum, selsum1, isAdd);
-////////////////////////////////////////////////////////////////////////////
- 
-	input logic					xsign;			// Sign of X 
-	input logic					ysign;			// Sign of Y 
-	input logic					zsign;			// Sign of Z
-	input logic					isAdd;			// Selects the addition (vs. subtraction) case of the zero-sign logic
-	input logic					negsum0;		// Sum in +0 mode is negative 
-	input logic					negsum1;		// Sum in +1 mode is negative 
-	input logic					bsM;				// sticky bit from addend
-	input logic		[2:0]		FrmM;				// Rounding mode; 3'b010 (round to -inf per the comments below) affects the zero sign
-	input logic		[4:0]		FmaFlagsM;				// Flags; [4] forces wsign to 0, [1] selects psign for a zero result
-	input logic					sumzero;		// Sum = 0
-	input logic					zinfM;			// Z = Inf
-	input logic					inf;			// Some input = Inf
-	output logic					wsign;			// Sign of W 
-	output logic					invz;			// Invert addend into adder
-	output logic					negsum;			// Negate result of adder
-	output logic					selsum1;		// Select +1 mode from compound adder
- 
-	// Internal nodes
-
-	wire					zerosign;    	// sign if result= 0 
-	wire					sumneg;    	// sign if result is neither special nor zero (last choice in wsign)
-	wire					infsign;     	// sign if result= Inf 
-logic tmp; // NOTE(review): only assigned below, never read in this module
-
-	// Compute sign of product 
-
-	assign psign = xsign ^ ysign; // NOTE(review): psign has no declaration in this module; presumably relies on an implicit net - confirm
-
-	// Invert addend if sign of Z is different from sign of product
-
-	//do you invert z
-	assign invz = (zsign ^ psign);
-
-	assign selsum1 = invz;
-	//negate sum if it is negative
-	assign negsum = (selsum1&negsum1) | (~selsum1&negsum0);
-	// is the sum negative
-	// 	p - z:  negative when negsum1 is set
-	// 	-p + z: negative when negsum1 is clear
-	// 	-p - z: always negative
-	assign sumneg = invz&zsign&negsum1 | invz&psign&~negsum1 | (zsign&psign);
-	//always @(invz or negsum0 or negsum1 or bsM or ps)
-	//	begin
-	//		if (~invz) begin               // both input logics have same sign  
-	//			negsum = 0;
-	//			selsum1 = 0;
-	//		end else if (bsM) begin        // sticky bit set on addend
-	//			selsum1 = 0;
-	//			negsum = negsum0; 
-	//		end else if (ps) begin 		// sticky bit set on product
-	//			selsum1 = 1;
-	//			negsum =  negsum1;
-	//		end else begin 				// both sticky bits clear
-	//			//selsum1 = negsum1; 	// KEP 210113-10:44 Selsum1 was adding 1 to values that were multiplied by 0
-	//			 selsum1 = ~negsum1; //original
-	//			negsum = negsum1;
-	//	end 
-	//end
-
-	// Compute sign of result
-	// This involves a special case when the sum is zero:
-	//   x+x retains the same sign as x even when x = +/- 0.
-	//   otherwise,  x-x = +0 unless in the RM mode when x-x = -0
-	// There is also a special case for NaNs and invalid results;
-	// the sign of the NaN produced is forced to be 0.
-	// Sign calculation is not in the critical path so the cases
-	// can be tolerated. 
-	// IEEE 754-2008 section 6.3 states 
-	// 		"When either an input or result is NaN, this standard does not interpret the sign of a NaN."
-	// 		also pertaining to negZero it states:
-	//			"When the sum/difference of two operands with opposite signs is exactly zero, the sign of that sum/difference
-	//			 shall be +0 in all rounding attributes EXCEPT roundTowardNegative. Under that attribute, the sign of an exact zero 
-	//			 sum/difference shall be -0.  However, x+x = x-(-X) retains the same sign as x even when x is zero."
- 
-	//assign zerosign = (~invz && killprodM) ? zsign : rm;//***look into
-//	assign zerosign = (~invz && killprodM) ? zsign : 0;
-	// zero sign
-	//	if product underflows then use psign
-	//	otherwise
-	//		addition
-	//			if cancellation then 0 unless round to -inf
-	//			otherwise psign
-	//		subtraction
-	//			if cancellation then 0 unless round to -inf
-	//			otherwise psign
-
-	assign zerosign = FmaFlagsM[1] ? psign :
-			  (isAdd ? (psign^zsign ? FrmM == 3'b010 : psign) :
-				  (psign^zsign ? psign : FrmM == 3'b010));
-	assign infsign = zinfM ? zsign : psign; //KEP 210112 keep the correct sign when result is infinity
-	//assign infsign = xinfM ? (yinfM ? psign : xsign) : yinfM ? ysign : zsign;//original
-	assign tmp = FmaFlagsM[4] ? 0 : (inf ? infsign :(sumzero ? zerosign : psign ^ negsum)); // NOTE(review): alternative sign formula; tmp does not drive any output here
-	assign wsign = FmaFlagsM[4] ? 0 : (inf ? infsign :(sumzero ? zerosign : sumneg));
-
-endmodule
--- a/wally-pipelined/src/fpu/FMA/special.sv
+++ b/wally-pipelined/src/fpu/FMA/special.sv
@ -1,67 +0,0 @@
-/////////////////////////////////////////////////////////////////////////////// 
-// Block Name:	special.v
-// Author:		David Harris
-// Date:		12/2/1995
-//
-// Block Description:
-//   This block implements special case handling for unusual operands (e.g. 
-//   0, NaN,  denormalize,  infinity).   The block consists of zero/one detectors.
-//
-/////////////////////////////////////////////////////////////////////////////
-
-/////////////////////////////////////////////////////////////////////////////
-module special(ReadData1E, ReadData2E, ReadData3E, xzeroE, yzeroE, zzeroE,
-				xnanE, ynanE, znanE, xdenormE, ydenormE, zdenormE, xinfE, yinfE, zinfE);
-/////////////////////////////////////////////////////////////////////////////
-
-	input logic   	[63:0]     	ReadData1E;              // Input ReadData1E
-	input logic     	[63:0]     	ReadData2E;           	// Input ReadData2E
-	input logic      	[63:0]    	ReadData3E;            	// Input ReadData3E 
-	output logic				xzeroE;		// Input ReadData1E = 0
-	output logic				yzeroE;		// Input ReadData2E = 0
-	output logic				zzeroE;		// Input ReadData3E = 0
-	output logic				xnanE;		// ReadData1E is NaN
-	output logic				ynanE;		// ReadData2E is NaN
-	output logic				znanE;		// ReadData3E is NaN
-	output logic				xdenormE;	// ReadData1E is denormalized
-	output logic				ydenormE;	// ReadData2E is denormalized
-	output logic				zdenormE;	// ReadData3E is denormalized
-	output logic				xinfE;		// ReadData1E is infinity
-	output logic				yinfE;		// ReadData2E is infinity
-	output logic				zinfE;		// ReadData3E is infinity
-
-	// In the actual circuit design, the gates looking at bits
-	// 51:0 and at bits 62:52 should be shared among the various detectors.
-
-	// Check if input is NaN (bits 62:52 all ones, bits 51:0 not all zero)
-
-	assign xnanE = &ReadData1E[62:52] && |ReadData1E[51:0]; 
-	assign ynanE = &ReadData2E[62:52] && |ReadData2E[51:0]; 
-	assign znanE = &ReadData3E[62:52] && |ReadData3E[51:0];
-
-	// Check if input is denormalized (bits 62:52 all zero, bits 51:0 not all zero)
-
-	assign xdenormE = ~(|ReadData1E[62:52]) && |ReadData1E[51:0]; 
-	assign ydenormE = ~(|ReadData2E[62:52]) && |ReadData2E[51:0]; 
-	assign zdenormE = ~(|ReadData3E[62:52]) && |ReadData3E[51:0];
-
-	// Check if input is infinity (bits 62:52 all ones, bits 51:0 all zero)
-
-	assign xinfE = &ReadData1E[62:52] && ~(|ReadData1E[51:0]); 
-	assign yinfE = &ReadData2E[62:52] && ~(|ReadData2E[51:0]); 
-	assign zinfE = &ReadData3E[62:52] && ~(|ReadData3E[51:0]);
-
-	// Check if inputs are zero (bits 62:0 all clear; bit 63 is ignored).
-	// An earlier version also forced denormalized inputs to zero (see the commented-out assigns below).
-	//   In the circuit implementation,  this can be optimized
-	// to just check if the exponent is zero.
-	
-	// KATHERINE - commented following (21/01/11)
-	// assign xzeroE = ~(|ReadData1E[62:0]) || xdenormE;
-	// assign yzeroE = ~(|ReadData2E[62:0]) || ydenormE;
-	// assign zzeroE = ~(|ReadData3E[62:0]) || zdenormE;
-	// KATHERINE - removed denorm to prevent outputting zero when computing with a denormalized number
-	assign xzeroE = ~(|ReadData1E[62:0]);
-	assign yzeroE = ~(|ReadData2E[62:0]);
-	assign zzeroE = ~(|ReadData3E[62:0]);
- endmodule
--- a/wally-pipelined/src/fpu/FMA/tbgen/StineVectors
+++ b/wally-pipelined/src/fpu/FMA/tbgen/StineVectors
--- a/wally-pipelined/src/fpu/FMA/tbgen/testMini
+++ b/wally-pipelined/src/fpu/FMA/tbgen/testMini
--- a/wally-pipelined/src/fpu/convert_inputs.sv
+++ b/wally-pipelined/src/fpu/convert_inputs.sv
@ -29,23 +29,27 @@ module convert_inputs(Float1, Float2, op1, op2, op_type, P);

   // Test if the input exponent is zero, because if it is then the
   // exponent of the converted number should be zero. 
-   assign Zexp1 = ~(op1[62] | op1[61] | op1[60] | op1[59] | 
-		    op1[58] | op1[57] | op1[56] | op1[55]);
-   assign Zexp2 = ~(op2[62] | op2[61] | op2[60] | op2[59] | 
-		    op2[58] | op2[57] | op2[56] | op2[55]);
-   assign Oexp1 =  (op1[62] & op1[61] & op1[60] & op1[59] & 
-		    op1[58] & op1[57] & op1[56] & op1[55]);
-   assign Oexp2 =  (op2[62] & op2[61] & op2[60] & op2[59] & 
-		    op2[58] & op2[57] & op2[56] &op2[55]);
+   assign Zexp1 = ~(|op1[30:23]);
+   assign Zexp2 = ~(|op2[30:23]);
+   assign Oexp1 =  (&op1[30:23]);
+   assign Oexp2 =  (&op2[30:23]);
+   // assign Zexp1 = ~(op1[62] | op1[61] | op1[60] | op1[59] | 
+	// 	    op1[58] | op1[57] | op1[56] | op1[55]);
+   // assign Zexp2 = ~(op2[62] | op2[61] | op2[60] | op2[59] | 
+	// 	    op2[58] | op2[57] | op2[56] | op2[55]);
+   // assign Oexp1 =  (op1[62] & op1[61] & op1[60] & op1[59] & 
+	// 	    op1[58] & op1[57] & op1[56] & op1[55]);
+   // assign Oexp2 =  (op2[62] & op2[61] & op2[60] & op2[59] & 
+	// 	    op2[58] & op2[57] & op2[56] &op2[55]);

   // Conditionally convert op1. Lower 29 bits are zero for single precision.
-   assign Float1[62:29] = conv_SP ? {op1[62], {3{(~op1[62]&~Zexp1)|Oexp1}}, op1[61:32]}
+   assign Float1[62:29] = conv_SP ? {op1[30], {3{(~op1[30]&~Zexp1)|Oexp1}}, op1[29:0]}
 			  : op1[62:29];
   assign Float1[28:0] = op1[28:0] & {29{~conv_SP}};

   // Conditionally convert op2. Lower 29 bits are zero for single precision. 
-   assign Float2[62:29] = conv_SP ? {op2[62], 
-				     {3{(~op2[62]&~Zexp2)|Oexp2}}, op2[61:32]}
+   assign Float2[62:29] = conv_SP ? {op2[30], 
+				     {3{(~op2[30]&~Zexp2)|Oexp2}}, op2[29:0]}
 			  : op2[62:29];
   assign Float2[28:0] = op2[28:0] & {29{~conv_SP}};

@ -54,8 +58,8 @@ module convert_inputs(Float1, Float2, op1, op2, op_type, P);

   assign negate  = op_type[2] & ~op_type[1] & op_type[0];
   assign abs_val = op_type[2] & ~op_type[1] & ~op_type[0];
-   assign Float1[63]  = (op1[63] ^ negate) & ~abs_val;
-   assign Float2[63]  = op2[63];
+   assign Float1[63]  = conv_SP ? (op1[31] ^ negate) & ~abs_val : (op1[63] ^ negate) & ~abs_val;
+   assign Float2[63]  = conv_SP ? op2[31] : op2[63];

 endmodule // convert_inputs

--- a/wally-pipelined/src/fpu/convert_inputs_div.sv
+++ b/wally-pipelined/src/fpu/convert_inputs_div.sv
@ -3,22 +3,21 @@
 // it conditionally converts single precision values to double 
 // precision values and modifies the sign of op1. 
 // The converted operands are Float1 and Float2.
-
 module convert_inputs_div (Float1, Float2b, op1, op2, op_type, P);
   
-   input [63:0]  op1;           // 1st input operand (A)
-   input [63:0]  op2;           // 2nd input operand (B)
-   input 	 P;             // Result Precision (0 for double, 1 for single)
-   input 	 op_type;       // Operation   
+   input logic [63:0]  op1;           // 1st input operand (A)
+   input logic [63:0]  op2;           // 2nd input operand (B)
+   input logic 	       P;             // Result Precision (0 for double, 1 for single)
+   input logic 	       op_type;       // Operation   

-   output [63:0] Float1;	// Converted 1st input operand
-   output [63:0] Float2b;	// Converted 2nd input operand   
+   output logic [63:0] Float1;	      // Converted 1st input operand
+   output logic [63:0] Float2b;	      // Converted 2nd input operand   

-   wire [63:0] 	 Float2;   
-   wire 	 Zexp1;		// One if the exponent of op1 is zero
-   wire 	 Zexp2;		// One if the exponent of op2 is zero
-   wire 	 Oexp1;		// One if the exponent of op1 is all ones
-   wire 	 Oexp2;		// One if the exponent of op2 is all ones
+   logic [63:0]        Float2;   
+   logic 	       Zexp1;	      // One if the exponent of op1 is zero
+   logic 	       Zexp2;	      // One if the exponent of op2 is zero
+   logic 	       Oexp1;	      // One if the exponent of op1 is all ones
+   logic 	       Oexp2;	      // One if the exponent of op2 is all ones

   // Test if the input exponent is zero, because if it is then the
   // exponent of the converted number should be zero. 
--- a/wally-pipelined/src/fpu/divconv.sv
+++ b/wally-pipelined/src/fpu/divconv.sv
@ -1,11 +1,6 @@
-// `timescale 1ps/1ps
-module divconv (q1, qm1, qp1, q0, qm0, qp0, 
-		rega_out, regb_out, regc_out, regd_out,
-		regr_out, d, n, 
-		sel_muxa, sel_muxb, sel_muxr, 
-		reset, clk,
-		load_rega, load_regb, load_regc, load_regd,
-		load_regr, load_regs, P, op_type, exp_odd);
+module divconv (q1, qm1, qp1, q0, qm0, qp0, rega_out, regb_out, regc_out, regd_out,
+		regr_out, d, n, sel_muxa, sel_muxb, sel_muxr, reset, clk, load_rega, load_regb, 
+		load_regc, load_regd, load_regr, load_regs, P, op_type, exp_odd);

   input logic [52:0]   d, n;
   input logic [2:0] 	sel_muxa, sel_muxb;
@ -40,9 +35,7 @@ module divconv (q1, qm1, qp1, q0, qm0, qp0,
   logic [127:0] 	constant, constant2;
   logic [63:0] 	q_const, qp_const, qm_const;
   logic [63:0] 	d2, n2;   
-   logic [11:0] 	d3;  
-
-   logic cout1, cout2, cout3, cout4, cout5, cout6, cout7, muxr_out; 
+   logic [11:0] 	d3;   

   // Check if exponent is odd for sqrt
   // If exp_odd=1 and sqrt, then M/2 and use ia_addr=0 as IA
@ -68,9 +61,9 @@ module divconv (q1, qm1, qp1, q0, qm0, qp0,
   mux2 #(64) mx5 (muxb_out, mcand_q, sel_muxr&op_type, mplier);   
   mux2 #(64) mx6 (muxa_out, mcand_q, sel_muxr, mcand);
   // TDM multiplier (carry/save)
-   multiplier mult1 (mcand, mplier, Sum, Carry);   // ***multiply
+   multiplier mult1 (mcand, mplier, Sum, Carry);
   // Q*D - N (reversed but changed in rounder.v to account for sign reversal)
-   csa #(128) csa1 (Sum, Carry, constant, Sum2, Carry2); //***adder
+   csa #(128) csa1 (Sum, Carry, constant, Sum2, Carry2);
   // Add ulp for subtraction in remainder
   mux2 #(1) mx7 (1'b0, 1'b1, sel_muxr, muxr_out);

@ -80,15 +73,17 @@ module divconv (q1, qm1, qp1, q0, qm0, qp0,
   mux2 #(64) mxA ({64'hFFFF_FFFF_FFFF_F9FF}, {64'hFFFF_FF3F_FFFF_FFFF}, P, qm_const);
   
   // CPA (from CSA)/Remainder addition/subtraction
-   ldf128 cpa1 (cout1, mul_out, Sum2, Carry2, muxr_out); //***adder
+   adder #(128) cpa1 (Sum2, Carry2, muxr_out, mul_out, cout1);   
+   
   // Assuming [1,2) - q1
-   ldf64 cpa2 (cout2, q_out1, regb_out, q_const, 1'b0); //***adder
-   ldf64 cpa3 (cout3, qp_out1, regb_out, qp_const, 1'b0); //***adder
-   ldf64 cpa4 (cout4, qm_out1, regb_out, qm_const, 1'b1);    //***adder
-   // Assuming [0.5,1) - q0
-   ldf64 cpa5 (cout5, q_out0, {regb_out[62:0], vss}, q_const, 1'b0); //***adder
-   ldf64 cpa6 (cout6, qp_out0, {regb_out[62:0], vss}, qp_const, 1'b0); //***adder
-   ldf64 cpa7 (cout7, qm_out0, {regb_out[62:0], vss}, qm_const, 1'b1); //***adder
+   adder #(64) cpa2 (regb_out, q_const, 1'b0, q_out1, cout2);
+   adder #(64) cpa3 (regb_out, qp_const, 1'b0, qp_out1, cout3);
+   adder #(64) cpa4 (regb_out, qm_const, 1'b1, qm_out1, cout4);
+   // Assuming [0.5,1) - q0   
+   adder #(64) cpa5 ({regb_out[62:0], vss}, q_const, 1'b0, q_out0, cout5);
+   adder #(64) cpa6 ({regb_out[62:0], vss}, qp_const, 1'b0, qp_out0, cout6);
+   adder #(64) cpa7 ({regb_out[62:0], vss}, qm_const, 1'b1, qm_out0, cout7);    
+
   // One's complement instead of two's complement (for hw efficiency)
   assign three = {~mul_out[126], mul_out[126], ~mul_out[125:63]};   
   mux2 #(64) mxTC (~mul_out[126:63], three[64:1],  op_type, twocmp_out);
@ -112,9 +107,11 @@ endmodule // divconv

 // module adder #(parameter WIDTH=8)
 //    (input  logic [WIDTH-1:0] a, b,
-//     output logic [WIDTH-1:0] y);
+//     input logic 	     cin,
+//     output logic [WIDTH-1:0] y,
+//     output logic 	     cout);
   
-//    assign y = a + b;
+//    assign {cout, y} = a + b + cin;
   
 // endmodule // adder

@ -226,10 +223,33 @@ endmodule // divconv

 // endmodule // mux6

-// module eqcmp #(parameter WIDTH = 8)
-//    (input  logic [WIDTH-1:0] a, b,
-//     output logic             y);
+module eqcmp #(parameter WIDTH = 8) // equality comparator over two WIDTH-bit inputs
+   (input  logic [WIDTH-1:0] a, b,
+    output logic             y);

-//    assign y = (a == b);
+   assign y = (a == b); // y is the result of the == comparison of a and b
   
-// endmodule // eqcmp
+endmodule // eqcmp
+
+// module fa (input logic a, b, c, output logic sum, carry);
+
+//    assign sum = a^b^c;
+//    assign carry = a&b|a&c|b&c;   
+
+// endmodule // fa
+
+// module csa #(parameter WIDTH=8) 
+//    (input logic [WIDTH-1:0] a, b, c,
+//     output logic [WIDTH-1:0] sum, carry);
+
+//    logic [WIDTH:0] 	     carry_temp;   
+//    genvar 		     i;
+//    generate
+//       for (i=0;i<WIDTH;i=i+1)
+// 	begin : genbit
+// 	   fa fa_inst (a[i], b[i], c[i], sum[i], carry_temp[i+1]);
+// 	end
+//    endgenerate
+//    assign carry = {1'b0, carry_temp[WIDTH-1:1], 1'b0};     
+   
+// endmodule // csa
--- a/wally-pipelined/src/fpu/exception_div.sv
+++ b/wally-pipelined/src/fpu/exception_div.sv
@ -1,38 +1,36 @@
 // Exception logic for the floating point adder. Note: We may 
 // actually want to move to where the result is computed.
-
 module exception_div (Ztype, Invalid, Denorm, ANorm, BNorm, A, B, op_type);

-   input [63:0] A;		// 1st input operand (op1)
-   input [63:0] B;		// 2nd input operand (op2)
-   input 	op_type;        // Determine operation   
+   input logic [63:0] A;		// 1st input operand (op1)
+   input logic [63:0] B;		// 2nd input operand (op2)
+   input logic 	      op_type;          // Determine operation   
   
-   output [2:0] Ztype;		// Indicates type of result (Z)
-   output 	Invalid;	// Invalid operation exception
-   output 	Denorm;		// Denormalized input
-   output       ANorm;          // A is not zero or Denorm
-   output       BNorm;          // B is not zero or Denorm
+   output logic [2:0] Ztype;		// Indicates type of result (Z)
+   output logic       Invalid;	        // Invalid operation exception
+   output logic       Denorm;		// Denormalized input
+   output logic       ANorm;            // A is not zero or Denorm
+   output logic       BNorm;            // B is not zero or Denorm
   
-   wire		AzeroM;	 	// '1' if the mantissa of A is zero
-   wire		BzeroM;		// '1' if the mantissa of B is zero
-   wire		AzeroE;	 	// '1' if the exponent of A is zero
-   wire		BzeroE;		// '1' if the exponent of B is zero
-   wire		AonesE;	 	// '1' if the exponent of A is all ones
-   wire		BonesE;		// '1' if the exponent of B is all ones
-   wire		ADenorm; 	// '1' if A is a denomalized number
-   wire		BDenorm; 	// '1' if B is a denomalized number
-   wire		AInf;	 	// '1' if A is infinite
-   wire		BInf;	 	// '1' if B is infinite
-   wire		AZero;	 	// '1' if A is 0
-   wire		BZero;	 	// '1' if B is 0
-   wire		ANaN;	 	// '1' if A is a not-a-number
-   wire		BNaN; 		// '1' if B is a not-a-number
-   wire		ASNaN;	 	// '1' if A is a signalling not-a-number
-   wire		BSNaN;	 	// '1' if B is a signalling not-a-number
-   wire		ZQNaN;	 	// '1' if result Z is a quiet NaN
-   wire		ZInf;	 	// '1' if result Z is an infnity
-   wire 	square_root;    // '1' if square root operation
-   wire 	Zero;           // '1' if result is zero   
+   logic 	      AzeroM;	 	// '1' if the mantissa of A is zero
+   logic 	      BzeroM;		// '1' if the mantissa of B is zero
+   logic 	      AzeroE;	 	// '1' if the exponent of A is zero
+   logic 	      BzeroE;		// '1' if the exponent of B is zero
+   logic 	      AonesE;	 	// '1' if the exponent of A is all ones
+   logic 	      BonesE;		// '1' if the exponent of B is all ones
+   logic 	      ADenorm; 	        // '1' if A is a denomalized number
+   logic 	      BDenorm; 	        // '1' if B is a denomalized number
+   logic 	      AInf;	 	// '1' if A is infinite
+   logic 	      BInf;	 	// '1' if B is infinite
+   logic 	      AZero;	 	// '1' if A is 0
+   logic 	      BZero;	 	// '1' if B is 0
+   logic 	      ANaN;	 	// '1' if A is a not-a-number
+   logic 	      BNaN; 		// '1' if B is a not-a-number
+   logic 	      ASNaN;	 	// '1' if A is a signalling not-a-number
+   logic 	      BSNaN;	 	// '1' if B is a signalling not-a-number
+   logic 	      ZQNaN;	 	// '1' if result Z is a quiet NaN
+   logic 	      ZInf;	 	// '1' if result Z is an infnity
+   logic 	      Zero;             // '1' if result is zero   
   
   parameter [51:0]  fifty_two_zeros = 52'h0; // Use parameter?

@ -93,4 +91,3 @@ module exception_div (Ztype, Invalid, Denorm, ANorm, BNorm, A, B, op_type);
   assign Ztype[2] = BZero&~op_type;   

 endmodule // exception
-
--- a/wally-pipelined/src/fpu/faddcvt.sv
+++ b/wally-pipelined/src/fpu/faddcvt.sv
@ -0,0 +1,417 @@
+//
+// File name : fpadd
+// Title     : Floating-Point Adder/Subtractor
+// project   : FPU
+// Library   : fpadd
+// Author(s) : James E. Stine, Jr., Brett Mathis
+// Purpose   : definition of main unit to floating-point add/sub
+// notes :   
+//
+// Copyright Oklahoma State University
+// Copyright AFRL
+//
+// Basic and Denormalized Operations
+//
+// Step 1: Load operands, set flags, and convert SP to DP
+// Step 2: Check for special inputs ( +/- Infinity,  NaN)
+// Step 3: Compare exponents.  Swap the operands if exp1 < exp2
+//         or if (exp1 = exp2 AND mnt1 < mnt2)
+// Step 4: Shift the mantissa corresponding to the smaller exponent, 
+//          and extend precision by three bits to the right.
+// Step 5: Add or subtract the mantissas.
+// Step 6: Normalize the result.
+//   Shift left until normalized.  Normalized when the value to the 
+//   left of the binary point is 1.
+// Step 7: Round the result.
+// Step 8: Put sum onto output.
+//
+
+module faddcvt(
+   input logic          clk,
+   input logic          reset,
+   input logic          FlushM,
+   input logic          StallM,
+   input logic  [63:0]  FSrcXE,		// 1st input operand (A)
+   input logic  [63:0]  FSrcYE,		// 2nd input operand (B)
+   input logic  [3:0]   FOpCtrlE, FOpCtrlM,	// Function opcode
+   input logic          FmtE, FmtM,   		// Result Precision (1 for double, 0 for single); each stage inverts it to form P
+   input logic  [2:0] 	FrmM,		// Rounding mode - specify values 
+   output logic [63:0]  FAddResM,	// Result of operation
+   output logic [4:0]   FAddFlgM);   	// IEEE exception flags 
+   // Top level of the add/convert unit: fpuaddcvt1 (E signals) -> pipeline registers -> fpuaddcvt2 (M signals).
+   logic [63:0] 	AddSumE, AddSumM;
+   logic [63:0]   AddSumTcE, AddSumTcM;
+   logic [3:0] 	AddSelInvE, AddSelInvM;
+   logic [10:0] 	AddExpPostSumE,AddExpPostSumM;
+   logic 		   AddCorrSignE, AddCorrSignM;
+   logic          AddOp1NormE, AddOp1NormM;
+   logic          AddOp2NormE, AddOp2NormM;
+   logic          AddOpANormE,  AddOpANormM;
+   logic          AddOpBNormE, AddOpBNormM;
+   logic          AddInvalidE, AddInvalidM;
+   logic 		   AddDenormInE, AddDenormInM;
+   logic          AddSwapE, AddSwapM;
+   logic          AddNormOvflowE, AddNormOvflowM; // registered below but not connected to fpadd2, which recomputes AddNormOvflowM itself
+   logic          AddSignAE, AddSignAM;
+   logic 		   AddConvertE, AddConvertM;
+   logic [63:0] 	AddFloat1E, AddFloat2E, AddFloat1M, AddFloat2M;
+   logic [11:0] 	AddExp1DenormE, AddExp2DenormE, AddExp1DenormM, AddExp2DenormM;
+   logic [10:0] 	AddExponentE, AddExponentM;
+
+   // First half: operand conversion, exception checks, mantissa alignment and the 64-bit add/subtract.
+   fpuaddcvt1 fpadd1 (.FSrcXE, .FSrcYE, .FOpCtrlE, .FmtE, .AddFloat1E, .AddFloat2E, .AddExponentE, 
+                     .AddExpPostSumE, .AddExp1DenormE, .AddExp2DenormE, .AddSumE, .AddSumTcE, .AddSelInvE, 
+                     .AddCorrSignE, .AddSignAE, .AddOp1NormE, .AddOp2NormE, .AddOpANormE, .AddOpBNormE, .AddInvalidE, 
+                     .AddDenormInE, .AddConvertE, .AddSwapE, .AddNormOvflowE);
+   // E -> M registers. flopenrc is defined elsewhere; presumably FlushM clears and ~StallM enables - TODO confirm port order.
+   flopenrc #(64) EMRegAdd1(clk, reset, FlushM, ~StallM, AddSumE, AddSumM); 
+   flopenrc #(64) EMRegAdd2(clk, reset, FlushM, ~StallM, AddSumTcE, AddSumTcM); 
+   flopenrc #(11) EMRegAdd3(clk, reset, FlushM, ~StallM, AddExpPostSumE, AddExpPostSumM); 
+   flopenrc #(64) EMRegAdd4(clk, reset, FlushM, ~StallM, AddFloat1E, AddFloat1M); 
+   flopenrc #(64) EMRegAdd5(clk, reset, FlushM, ~StallM, AddFloat2E, AddFloat2M); 
+   flopenrc #(12) EMRegAdd6(clk, reset, FlushM, ~StallM, AddExp1DenormE, AddExp1DenormM); 
+   flopenrc #(12) EMRegAdd7(clk, reset, FlushM, ~StallM, AddExp2DenormE, AddExp2DenormM); 
+   flopenrc #(11) EMRegAdd8(clk, reset, FlushM, ~StallM, AddExponentE, AddExponentM);
+   flopenrc #(15) EMRegAdd9(clk, reset, FlushM, ~StallM, // 15 = 4-bit AddSelInv + 11 single-bit flags
+                           {AddSelInvE, AddCorrSignE, AddOp1NormE, AddOp2NormE, AddOpANormE, AddOpBNormE, AddInvalidE, AddDenormInE, AddConvertE, AddSwapE, AddNormOvflowE, AddSignAE},
+                           {AddSelInvM, AddCorrSignM, AddOp1NormM, AddOp2NormM, AddOpANormM, AddOpBNormM, AddInvalidM, AddDenormInM, AddConvertM, AddSwapM, AddNormOvflowM, AddSignAM}); 
+
+   // Second half: sign correction, normalization shift, rounding and exception flags.
+   fpuaddcvt2 fpadd2 (.FrmM, .FOpCtrlM, .FmtM, .AddSumM, .AddSumTcM, .AddFloat1M, .AddFloat2M, 
+                     .AddExp1DenormM, .AddExp2DenormM, .AddExponentM, .AddExpPostSumM, .AddSelInvM, 
+                     .AddOp1NormM, .AddOp2NormM, .AddOpANormM, .AddOpBNormM, .AddInvalidM, .AddDenormInM, 
+                     .AddSignAM, .AddCorrSignM, .AddConvertM, .AddSwapM, .FAddResM, .FAddFlgM);
+endmodule
+
+module fpuaddcvt1 (AddSumE, AddSumTcE, AddSelInvE, AddExpPostSumE, AddCorrSignE, AddOp1NormE, AddOp2NormE, AddOpANormE, AddOpBNormE, AddInvalidE, AddDenormInE, AddConvertE, AddSwapE, AddNormOvflowE, AddSignAE, AddFloat1E, AddFloat2E, AddExp1DenormE, AddExp2DenormE, AddExponentE, FSrcXE, FSrcYE, FOpCtrlE, FmtE);
+   // First half of the adder/converter: converts the operands, checks exceptions, aligns the mantissas and adds/subtracts them.
+   input logic [63:0] FSrcXE;		// 1st input operand (A)
+   input logic [63:0] FSrcYE;		// 2nd input operand (B)
+   input logic [3:0]	FOpCtrlE;	// Function opcode
+   input logic 	FmtE;   		// Result Precision (1 for double, 0 for single)
+
+   wire          P;
+   assign P = ~FmtE;   // inverted precision select handed to convert_inputs
+
+   wire [63:0] 	 IntValue;
+   wire [11:0] 	 exp1, exp2;
+   wire [11:0] 	 exp_diff1, exp_diff2;
+   wire [11:0] 	 exp_shift;
+   wire [51:0] 	 mantissaA;
+   wire [56:0] 	 mantissaA1;
+   wire [63:0] 	 mantissaA3;
+   wire [51:0] 	 mantissaB; 
+   wire [56:0] 	 mantissaB1, mantissaB2;
+   wire [63:0] 	 mantissaB3;
+   wire 	 exp_gt63;
+   wire 	 Sticky_out;
+   wire          sub;
+   wire 	 zeroB;
+   wire [5:0]	 align_shift; 
+
+   output logic [63:0] 	 AddFloat1E; 
+   output logic [63:0] 	 AddFloat2E;
+   output logic [10:0] 	 AddExponentE;
+   output logic [10:0]	 AddExpPostSumE;
+   output logic [11:0]	 AddExp1DenormE, AddExp2DenormE;//KEP used to be [10:0]
+   output logic [63:0] AddSumE, AddSumTcE;
+   output logic [3:0]  AddSelInvE;
+   output logic        AddCorrSignE;
+   output logic 	 AddSignAE;
+   output logic	 AddOp1NormE, AddOp2NormE;
+   output logic	 AddOpANormE, AddOpBNormE;
+   output logic	 AddInvalidE;
+   output logic 	 AddDenormInE;
+//   output logic 	 exp_valid;
+   output logic 	 AddConvertE;
+   output logic        AddSwapE;
+   output logic 	 AddNormOvflowE;
+   wire [5:0]	 ZP_mantissaA;
+   wire [5:0]	 ZP_mantissaB;
+   wire		 ZV_mantissaA;
+   wire		 ZV_mantissaB;
+
+   // Convert the input operands to their appropriate forms based on 
+   // the original operands, the FOpCtrlE , and their precision P. 
+   // Single precision inputs are converted to double precision 
+   // and the sign of the first operand is set appropriately based on
+   // if the operation is absolute value or negation. 
+
+   convert_inputs conv1 (AddFloat1E, AddFloat2E, FSrcXE, FSrcYE, FOpCtrlE, P);
+
+   // Test for exceptions and return the "Invalid Operation" and
+   // "Denormalized" Input Flags. The "AddSelInvE" is used in
+   // the third pipeline stage to select the result. Also, AddOp1NormE
+   // and AddOp2NormE are one if FSrcXE and FSrcYE are not zero or denormalized.
+   // sub is one if the effective operation is subtraction. 
+
+   exception exc1 (AddSelInvE, AddInvalidE, AddDenormInE, AddOp1NormE, AddOp2NormE, sub, 
+		   AddFloat1E, AddFloat2E, FOpCtrlE);
+
+   // Perform Exponent Subtraction (used for alignment). For performance
+   // both exponent subtractions are performed in parallel. This was 
+   // changed to a behavior level to allow the tools to try to optimize
+   // the two parallel additions. The input values are zero-extended to 12 
+   // bits prior to performing the addition. 
+
+   assign exp1 = {1'b0, AddFloat1E[62:52]};
+   assign exp2 = {1'b0, AddFloat2E[62:52]};
+   assign exp_diff1 = exp1 - exp2;
+   assign exp_diff2 = AddDenormInE ? ({AddFloat2E[63], exp2[10:0]} - {AddFloat1E[63], exp1[10:0]}): exp2 - exp1;
+
+   // The second operand (B) should be set to zero, if FOpCtrlE does not
+   // specify addition or subtraction
+   assign zeroB = FOpCtrlE[2] | FOpCtrlE[1];
+
+   // Swapped operands if zeroB is not one and exp1 < exp2. 
+   // Swapping causes exp2 to be used for the result exponent. 
+   // Only the exponent of the larger operand is used to determine
+   // the final result. 
+   assign AddSwapE = exp_diff1[11] & ~zeroB;
+   assign AddExponentE = AddSwapE ? exp2[10:0] : exp1[10:0];
+   assign AddExpPostSumE = AddSwapE ? exp2[10:0] : exp1[10:0];   // same selection as AddExponentE
+   assign mantissaA = AddSwapE ? AddFloat2E[51:0] : AddFloat1E[51:0];
+   assign mantissaB = AddSwapE ? AddFloat1E[51:0] : AddFloat2E[51:0];
+   assign AddSignAE     = AddSwapE ? AddFloat2E[63] : AddFloat1E[63];   
+
+   // Leading-zero detectors on the two (possibly swapped) 52-bit mantissas.
+   // ZP_mantissaA/B feed the denormalized-exponent calculation below;
+   // ZV_mantissaA/B are not used further in this module.
+   // modified to 52 bits to detect leading zeroes on denormalized mantissas
+   lz52 lz_norm_1 (ZP_mantissaA, ZV_mantissaA, mantissaA);
+   lz52 lz_norm_2 (ZP_mantissaB, ZV_mantissaB, mantissaB);
+
+   // Denormalized exponents created by subtracting the leading zeroes from the original exponents
+   assign AddExp1DenormE = AddSwapE ? (exp1 - {6'b0, ZP_mantissaB}) : (exp1 - {6'b0, ZP_mantissaA}); //KEP extended ZP_mantissa 
+   assign AddExp2DenormE = AddSwapE ? (exp2 - {6'b0, ZP_mantissaA}) : (exp2 - {6'b0, ZP_mantissaB});
+
+   // Determine the alignment shift and limit it to 63. If any bit from 
+   // exp_shift[6] to exp_shift[11] is one, then shift is set to all ones. 
+   assign exp_shift = AddSwapE ? exp_diff2 : exp_diff1;
+   assign exp_gt63 = exp_shift[11] | exp_shift[10] | exp_shift[9] 
+     | exp_shift[8] | exp_shift[7] | exp_shift[6];
+   assign align_shift = exp_shift[5:0] | {6{exp_gt63}}; //KEP used to be all of exp_shift
+
+   // Unpack the 52-bit mantissas to 57-bit numbers of the form.
+   //    001.M[51]M[50] ... M[1]M[0]00
+   // Unless the number has an exponent of zero, in which case it
+   // is unpacked as
+   //    000.00 ... 00
+   // This effectively flushes denormalized values to zero. 
+   // The three bits to the left of the binary point prevent overflow
+   // and loss of sign information. The two bits to the right of the 
+   // original mantissa form the "guard" and "round" bits that are used
+   // to round the result. 
+   assign AddOpANormE = AddSwapE ? AddOp2NormE : AddOp1NormE;
+   assign AddOpBNormE = AddSwapE ? AddOp1NormE : AddOp2NormE;
+   assign mantissaA1 = {2'h0, AddOpANormE, mantissaA[51:0]&{52{AddOpANormE}}, 2'h0};
+   assign mantissaB1 = {2'h0, AddOpBNormE, mantissaB[51:0]&{52{AddOpBNormE}}, 2'h0};
+
+   // Perform mantissa alignment using a 57-bit barrel shifter 
+   // If any of the bits shifted out are one, Sticky_out is set. 
+   // The size of the barrel shifter could be reduced by two bits
+   // by not adding the leading two zeros until after the shift. 
+   barrel_shifter_r57 bs1 (mantissaB2, Sticky_out, mantissaB1, align_shift);
+
+   // Place either the sign-extended 32-bit value or the original 64-bit value 
+   // into IntValue (to be used for integer to floating point conversion)
+   // assign IntValue [31:0] = FSrcXE[31:0];
+   // assign IntValue [63:32] = FOpCtrlE[0] ? {32{FSrcXE[31]}} : FSrcXE[63:32];
+   // NOTE(review): with the two assigns above commented out, IntValue has no driver here but is still selected into mantissaA3 when AddConvertE is set.
+   // If doing an integer to floating point conversion, mantissaA3 is set to 
+   // IntVal and the prenormalized exponent is set to 1084. Otherwise, 
+   // mantissaA3 is simply extended to 64-bits by setting the 7 LSBs to zero, 
+   // and the exponent value is left unchanged. 
+   // Under denormalized cases, the exponent before the rounder is set to 1
+   // if the normal shift value is 11.
+   assign AddConvertE       = ~FOpCtrlE[2] & FOpCtrlE[1];
+   assign mantissaA3    = (FOpCtrlE[3]) ? (FOpCtrlE[0] ? AddFloat1E : ~AddFloat1E) : (AddDenormInE ? ({12'h0, mantissaA}) : (AddConvertE ? IntValue : {mantissaA1, 7'h0}));
+
+   // Put zero in for mantissaB3, if zeroB is one. Otherwise, B is extended to 
+   // 64-bits by setting the 7 LSBs to the Sticky_out bit followed by six  
+   // zeros. 
+   assign mantissaB3[63:7] = (FOpCtrlE[3]) ? (57'h0) : (AddDenormInE ? {12'h0, mantissaB[51:7]} : mantissaB2 & {57{~zeroB}});
+   assign mantissaB3[6]    = (FOpCtrlE[3]) ? (1'b0) : (AddDenormInE ? mantissaB[6] : Sticky_out & ~zeroB);
+   assign mantissaB3[5:0]  = (FOpCtrlE[3]) ? (6'h01) : (AddDenormInE ? mantissaB[5:0] : 6'h0);
+
+   // The sign of the result needs to be corrected if the true
+   // operation is subtraction and the input operands were swapped. 
+   assign AddCorrSignE = ~FOpCtrlE[2]&~FOpCtrlE[1]&FOpCtrlE[0]&AddSwapE;
+
+   // 64-bit Mantissa Adder/Subtractor
+   cla64 add1 (AddSumE, mantissaA3, mantissaB3, sub); //***adder
+
+   // 64-bit Mantissa Subtractor - to get the two's complement of the 
+   // result when the sign from the adder/subtractor is negative. 
+   cla_sub64 sub1 (AddSumTcE, mantissaB3, mantissaA3); //***adder
+ 
+   // Finds normal underflow result to determine whether to round final exponent down
+   //***KEP used to be (AddSumE == 16'h0) I am unsure what it's supposed to be
+   assign AddNormOvflowE = (AddDenormInE & (AddSumE == 64'h0) & (AddOpANormE | AddOpBNormE) & ~FOpCtrlE[0]) ? 1'b1 : (AddSumE[63] ? AddSumTcE[52] : AddSumE[52]);
+
+endmodule // fpuaddcvt1
+
+
+//
+// File name : fpadd
+// Title     : Floating-Point Adder/Subtractor
+// project   : FPU
+// Library   : fpadd
+// Author(s) : James E. Stine, Jr., Brett Mathis
+// Purpose   : definition of main unit to floating-point add/sub
+// notes :   
+//
+// Copyright Oklahoma State University
+// Copyright AFRL
+//
+// Basic and Denormalized Operations
+//
+// Step 1: Load operands, set flags, and convert SP to DP
+// Step 2: Check for special inputs ( +/- Infinity,  NaN)
+// Step 3: Compare exponents.  Swap the operands if exp1 < exp2
+//         or if (exp1 = exp2 AND mnt1 < mnt2)
+// Step 4: Shift the mantissa corresponding to the smaller exponent, 
+//          and extend precision by three bits to the right.
+// Step 5: Add or subtract the mantissas.
+// Step 6: Normalize the result.
+//   Shift left until normalized.  Normalized when the value to the 
+//   left of the binary point is 1.
+// Step 7: Round the result.
+// Step 8: Put sum onto output.
+//
+
+
+module fpuaddcvt2 (FAddResM, FAddFlgM, AddSumM, AddSumTcM, AddSelInvM, AddExpPostSumM, AddCorrSignM, AddOp1NormM, AddOp2NormM, AddOpANormM, AddOpBNormM, AddInvalidM, AddDenormInM, AddConvertM, AddSwapM, AddSignAM, AddFloat1M, AddFloat2M, AddExp1DenormM, AddExp2DenormM, AddExponentM, FrmM, FOpCtrlM, FmtM);
+   // Second half of the adder/converter: corrects the sign, normalizes the sum, rounds it and produces the flags.
+   input [2:0] 	FrmM;		// Rounding mode - specify values 
+   input [3:0]	FOpCtrlM;	// Function opcode
+   input 	FmtM;   		// Result Precision (1 for double, 0 for single); inverted below to form P
+   // input 	AddOvEnM;		// Overflow trap enabled
+   // input 	AddUnEnM;   	// Underflow trap enabled
+   input [63:0] AddSumM, AddSumTcM;
+   input [63:0] 	 AddFloat1M; 
+   input [63:0] 	 AddFloat2M;
+   input [11:0]	 AddExp1DenormM, AddExp2DenormM;
+   input [10:0] 	 AddExponentM, AddExpPostSumM; //exp_pre;
+   //input		 exp_valid;
+   input [3:0] 	 AddSelInvM;
+   input		 AddOp1NormM, AddOp2NormM;
+   input		 AddOpANormM, AddOpBNormM;
+   input		 AddInvalidM;
+   input 	 AddDenormInM; 
+   input 	 AddSignAM; 
+   input         AddCorrSignM;
+   input 	 AddConvertM;
+   input          AddSwapM;
+   // input 	 AddNormOvflowM;
+
+   output [63:0] FAddResM;	// Result of operation
+   output [4:0]  FAddFlgM;   	// IEEE exception flags 
+   wire 	 AddDenormM;   	// Denormalized on input or output (driven from the rounder's DenormIO)
+
+   wire          P;
+   assign P = ~FmtM;
+
+   wire [10:0]   exp_pre;
+   wire [63:0] 	 Result;   
+   wire [63:0] 	 sum_norm, sum_norm_w_bypass;
+   wire [5:0] 	 norm_shift, norm_shift_denorm;
+   wire          exp_valid;
+   wire		 DenormIO;
+   wire [4:0] 	 FlagsIn;	
+   wire 	 Sticky_out;	// not referenced in this module
+   wire 	 sign_corr;
+   wire 	 zeroB;         // not referenced in this module
+   wire [10:0]	 AddExpPostSumM;	// net declaration for the input port of the same name
+   wire 	 mantissa_comp;
+   wire 	 mantissa_comp_sum;
+   wire 	 mantissa_comp_sum_tc;
+   wire 	 Float1_sum_comp;
+   wire 	 Float2_sum_comp;
+   wire 	 Float1_sum_tc_comp;
+   wire 	 Float2_sum_tc_comp;
+   wire 	 normal_underflow;
+   wire [63:0]   sum_corr;
+   logic AddNormOvflowM;	// computed locally below; the port of this name is commented out
+ 
+ 
+   logic 	AddOvEnM;		// Overflow trap enabled
+   logic 	AddUnEnM;   	// Underflow trap enabled
+
+   assign AddOvEnM = 1'b1;
+   assign AddUnEnM = 1'b1;
+   // Exponent value pre-rounding with considerations for denormalized
+   // cases/conversion cases (11'b10000111100 is 1084)
+   assign exp_pre       = AddDenormInM ?
+                          ((norm_shift == 6'b001011) ? 11'b00000000001 : (AddSwapM ? AddExp2DenormM[10:0] : AddExp1DenormM[10:0]))
+                          : (AddConvertM ? 11'b10000111100 : AddExponentM);
+
+
+   // Finds normal underflow result to determine whether to round final exponent down
+   // Comparison between each float and the resulting sum of the primary cla adder/subtractor and cla subtractor
+   assign Float1_sum_comp = (AddFloat1M[51:0] > AddSumM[51:0]) ? 1'b0 : 1'b1;
+   assign Float2_sum_comp = (AddFloat2M[51:0] > AddSumM[51:0]) ? 1'b0 : 1'b1;
+   assign Float1_sum_tc_comp = (AddFloat1M[51:0] > AddSumTcM[51:0]) ? 1'b0 : 1'b1;
+   assign Float2_sum_tc_comp = (AddFloat2M[51:0] > AddSumTcM[51:0]) ? 1'b0 : 1'b1;
+
+   // Determines the correct Float value to compare based on AddSwapM result
+   assign mantissa_comp_sum = AddSwapM ? Float2_sum_comp : Float1_sum_comp;
+   assign mantissa_comp_sum_tc = AddSwapM ? Float2_sum_tc_comp : Float1_sum_tc_comp;
+
+   // Determines the correct comparison result based on operation and sign of resulting AddSumM
+   assign mantissa_comp = (FOpCtrlM[0] ^ AddSumM[63]) ? mantissa_comp_sum_tc : mantissa_comp_sum;
+
+   // When the two sign bits are equal (XNOR) and at least one of AddOpANormM/AddOpBNormM
+   // is set, the normal underflow bit takes the mantissa comparison result; otherwise it is 0.
+   assign normal_underflow = ((AddFloat1M[63] ~^ AddFloat2M[63]) & (AddOpANormM | AddOpBNormM)) ? mantissa_comp : 1'b0;
+
+   // Determine the correct sign of the result
+   assign sign_corr = ((AddCorrSignM ^ AddSignAM) & ~AddConvertM) ^ AddSumM[63];   
+   
+   // If the AddSumM is negative, use its two's complement instead. 
+   // This value has to be 64-bits to correctly handle the 
+   // case 10...00
+   assign sum_corr = (AddDenormInM & (AddOpANormM | AddOpBNormM) & ( ( (AddFloat1M[63] ~^ AddFloat2M[63]) & FOpCtrlM[0] ) | ((AddFloat1M[63] ^ AddFloat2M[63]) & ~FOpCtrlM[0]) ))
+			 ? (AddSumM[63] ? AddSumM : AddSumTcM) : ( (FOpCtrlM[3]) ? AddSumM : (AddSumM[63] ? AddSumTcM : AddSumM));
+
+   // Finds normal underflow result to determine whether to round final exponent down
+   //KEP used to be (AddSumM == 16'h0) not sure what it is supposed to be
+   assign AddNormOvflowM = (AddDenormInM & (AddSumM == 64'h0) & (AddOpANormM | AddOpBNormM) & ~FOpCtrlM[0]) ? 1'b1 : (AddSumM[63] ? AddSumTcM[52] : AddSumM[52]);
+
+   // Leading-Zero Detector. Determine the size of the shift needed for
+   // normalization. If sum_corr is all zeros, the exp_valid is 
+   // zero; otherwise, it is one. 
+   lz64 lzd1 (norm_shift, exp_valid, sum_corr);
+
+   assign norm_shift_denorm = (AddDenormInM & ( (~AddOpANormM & ~AddOpBNormM) | normal_underflow)) ? (6'h00) : (norm_shift);
+
+   // Barrel shifter used for normalization. It takes as inputs the 
+   // corrected sum and the amount by which the sum should 
+   // be left shifted. It outputs the normalized sum. 
+   barrel_shifter_l64 bs2 (sum_norm, sum_corr, norm_shift_denorm);
+  
+   assign sum_norm_w_bypass = (FOpCtrlM[3]) ? (FOpCtrlM[0] ? ~sum_corr : sum_corr) : (sum_norm);
+
+   // Round the mantissa to a 52-bit value, with the leading one
+   // removed. If the result is a single precision number, the actual 
+   // mantissa is in the upper 23 bits and the lower 29 bits are zero. 
+   // At this point, normalization has already been performed, so we know 
+   // exactly where the rounding point is. The rounding unit also
+   // handles special cases and sets the exception flags.
+
+   // Changed DenormIO -> AddDenormM and FlagsIn -> FAddFlgM in order to
+   // help in processor reservation station detection of load/stores. In
+   // other words, the processor would like to know ahead of time that
+   // if the result is an exception then don't load or store.
+   rounder round1 (Result, DenormIO, FlagsIn, FrmM, P, AddOvEnM, AddUnEnM, exp_valid, 
+		   AddSelInvM, AddInvalidM, AddDenormInM, AddConvertM, sign_corr, exp_pre, norm_shift, sum_norm_w_bypass,
+		   AddExpPostSumM, AddOp1NormM, AddOp2NormM, AddFloat1M[63:52], AddFloat2M[63:52],
+		   AddNormOvflowM, normal_underflow, AddSwapM, FOpCtrlM, AddSumM);
+
+   // Drive the final result and the exception flags onto the outputs.
+   assign FAddResM = Result;
+   assign {AddDenormM, FAddFlgM} = {DenormIO, FlagsIn};
+   
+endmodule // fpuaddcvt2
+
+
--- a/wally-pipelined/src/fpu/fclassify.sv
+++ b/wally-pipelined/src/fpu/fclassify.sv
@ -2,49 +2,52 @@
 `include "wally-config.vh"

 module fclassify (
-    input  logic [63:0] SrcXE,
-    input  logic        FmtE,           // 0-Single 1-Double
+    input  logic XSgnE,
+    input  logic [51:0] XFracE,
+    input logic XNaNE, 
+    input logic XSNaNE,
+    input logic XNormE,
+    input logic XDenormE,
+    input logic XZeroE,
+    input logic XInfE,
+    // input  logic        FmtE,           // 0-Single 1-Double
    output logic [63:0] ClassResE
    );

-    logic [31:0] Single;
-    logic [63:0] Double;
-    logic Sgn;
-    logic Inf, NaN, Zero, Norm, Denorm;
-    logic PInf, QNaN, PZero, PNorm, PDenorm;
-    logic NInf, SNaN, NZero, NNorm, NDenorm;
-    logic MaxExp, ExpZero, ManZero, FirstBitFrac;
+    // logic XSgnE;
+    // logic Inf, NaN, Zero, Norm, Denorm;
+    logic PInf, PZero, PNorm, PDenorm;
+    logic NInf, NZero, NNorm, NDenorm;
+    // logic MaxExp, ExpZero, ManZero, FirstBitFrac;
   
    // Single and Double precision layouts
-    assign Single = SrcXE[63:32];
-    assign Double = SrcXE;
-    assign Sgn = SrcXE[63];
+    // assign XSgnE = FmtE ? FSrcXE[63] : FSrcXE[31];

    // basic calculations for readabillity
    
-    assign ExpZero = FmtE ? ~|Double[62:52] : ~|Single[30:23];
-    assign MaxExp = FmtE ? &Double[62:52] : &Single[30:23];
-    assign ManZero = FmtE ? ~|Double[51:0] : ~|Single[22:0];
-    assign FirstBitFrac = FmtE ? Double[51] : Single[22];
+    // assign ExpZero = FmtE ? ~|FSrcXE[62:52] : ~|FSrcXE[30:23];
+    // assign MaxExp = FmtE ? &FSrcXE[62:52] : &FSrcXE[30:23];
+    // assign ManZero = FmtE ? ~|FSrcXE[51:0] : ~|FSrcXE[22:0];
+    // assign FirstBitFrac = FmtE ? FSrcXE[51] : FSrcXE[22];

    // determine the type of number
-    assign NaN      = MaxExp & ~ManZero;
-    assign Inf = MaxExp & ManZero;
-    assign Zero     = ExpZero & ManZero;
-    assign Denorm= ExpZero & ~ManZero;
-    assign Norm   = ~ExpZero;
+    // assign NaN      = MaxExp & ~ManZero;
+    // assign Inf = MaxExp & ManZero;
+    // assign Zero     = ExpZero & ManZero;
+    // assign Denorm= ExpZero & ~ManZero;
+    // assign Norm   = ~ExpZero;

    // determine the sub categories
-    assign QNaN = FirstBitFrac&NaN;
-    assign SNaN = ~FirstBitFrac&NaN;
-    assign PInf = ~Sgn&Inf;
-    assign NInf = Sgn&Inf;
-    assign PNorm = ~Sgn&Norm;
-    assign NNorm = Sgn&Norm;
-    assign PDenorm = ~Sgn&Denorm;
-    assign NDenorm = Sgn&Denorm;
-    assign PZero = ~Sgn&Zero;
-    assign NZero = Sgn&Zero;
+    // assign QNaN = FirstBitFrac&NaN;
+    // assign SNaN = ~FirstBitFrac&NaN;
+    assign PInf = ~XSgnE&XInfE;
+    assign NInf = XSgnE&XInfE;
+    assign PNorm = ~XSgnE&XNormE;
+    assign NNorm = XSgnE&XNormE;
+    assign PDenorm = ~XSgnE&XDenormE;
+    assign NDenorm = XSgnE&XDenormE;
+    assign PZero = ~XSgnE&XZeroE;
+    assign NZero = XSgnE&XZeroE;

    // determine sub category and combine into the result
    //  bit 0 - -Inf
@ -57,6 +60,6 @@ module fclassify (
    //  bit 7 - +Inf
    //  bit 8 - signaling NaN
    //  bit 9 - quiet NaN
-    assign ClassResE = {{54{1'b0}}, QNaN, SNaN, PInf, PNorm,  PDenorm, PZero, NZero, NDenorm, NNorm, NInf};
+    assign ClassResE = {{54{1'b0}}, XNaNE&~XSNaNE, XSNaNE, PInf, PNorm,  PDenorm, PZero, NZero, NDenorm, NNorm, NInf};

 endmodule
--- a/wally-pipelined/src/fpu/fcmp.sv
+++ b/wally-pipelined/src/fpu/fcmp.sv
@ -42,28 +42,32 @@
 module fcmp (   
   input logic [63:0] op1, 
   input logic [63:0] op2,
+   input logic XNaNE, YNaNE,
+   input logic XZeroE, YZeroE,
+   input logic [63:0] FSrcXE,
+   input logic [63:0] FSrcYE,
   input logic [2:0]  FOpCtrlE,
   input logic 	      FmtE,

   
   output logic       Invalid, 		 // Invalid Operation
-   // output logic [1:0] FCC,  		 // Condition Codes 
   output logic [63:0] CmpResE);
+
   // Perform magnitude comparison between the 63 least signficant bits
   // of the input operands. Only LT and EQ are returned, since GT can
   // be determined from these values. 
   logic [1:0] FCC;  		 // Condition Codes 
   logic [7:0]	      w, x;
-   logic	      ANaN, BNaN;
-   logic	      Azero, Bzero;
+   // logic	      ANaN, BNaN;
+   // logic	      Azero, Bzero;
   logic 	      LT;                // magnitude op1 < magnitude op2
   logic 	      EQ;                // magnitude op1 = magnitude op2
-   
+
+
   magcompare64b_1 magcomp1 (w, x, {~op1[63], op1[62:0]}, {~op2[63], op2[62:0]});

   // Determine final values based on output of magnitude comparison, 
   // sign bits, and special case testing. 
-   exception_cmp_1 exc1 (ANaN, BNaN, Azero, Bzero, op1, op2, FOpCtrlE);
   
   // Perform magnitude comparison between the 63 least signficant bits
   // of the input operands. Only LT and EQ are returned, since GT can
@ -72,24 +76,10 @@ module fcmp (

   // Determine final values based on output of magnitude comparison, 
   // sign bits, and special case testing. 
-   exception_cmp_2 exc2 (.invalid(Invalid), .fcc(FCC), .LT_mag(LT), .EQ_mag(EQ), .ANaN(ANaN), .BNaN(BNaN), .Azero(Azero), .Bzero(Bzero), .FOpCtrlE(FOpCtrlE), .A(op1), .B(op2), .*);
+   exception_cmp_2 exc2 (.invalid(Invalid), .fcc(FCC), .LT_mag(LT), .EQ_mag(EQ), .ANaN(XNaNE), .BNaN(YNaNE), .Azero(XZeroE), .Bzero(YZeroE), .FOpCtrlE(FOpCtrlE), .A(op1), .B(op2), .FSrcXE, .FSrcYE, .*);

 endmodule // fpcomp

-// module magcompare2b (LT, GT, A, B);
-
-//    input logic [1:0] A;
-//    input logic [1:0] B;
-   
-//    output logic     LT;
-//    output logic     GT;
-
-//    // Determine if A < B  using a minimized sum-of-products expression
-//    assign LT = ~A[1]&B[1] | ~A[1]&~A[0]&B[0] | ~A[0]&B[1]&B[0];
-//    // Determine if A > B  using a minimized sum-of-products expression
-//    assign GT = A[1]&~B[1] | A[1]&A[0]&~B[0] | A[0]&~B[1]&~B[0];
-
-// endmodule // magcompare2b

 // 2-bit magnitude comparator
 // This module compares two 2-bit values A and B. LT is '1' if A < B 
@ -195,135 +185,6 @@ module magcompare64b_1 (w, x,  A, B);

 endmodule // magcompare64b

-// This module takes 64-bits inputs A and B, two magnitude comparison
-// flags LT_mag and EQ_mag, and a 2-bit signal FOpCtrlE that indicates the type of 
-// operands being compared as indicated below.
-//	FOpCtrlE	Description
-//	 00	double precision numbers
-//	 01	single precision numbers
-//	 10	half precision numbers
-//	 11	bfloat precision numbers
-//
-// The comparator produces a 2-bit signal fcc, which
-// indicates the result of the comparison as follows:
-//     fcc 	decscription
-//      00	A = B	
-//      01	A < B	
-//      10	A > B	
-//      11	A and B	are unordered (i.e., A or B is NaN)
-// It also produces a invalid operation flag, which is one
-// if either of the input operands is a signaling NaN.
-
-module exception_cmp_1 (ANaN, BNaN, Azero, Bzero, A, B, FOpCtrlE);
-
-   input logic [63:0] A;
-   input logic [63:0] B;
-   input logic [2:0]  FOpCtrlE;
-
-   logic 		      dp, sp, hp;
-
-   output logic 	      ANaN;
-   output logic 	      BNaN;
-   output logic               Azero;
-   output logic               Bzero;
-
-   assign dp = !FOpCtrlE[1]&!FOpCtrlE[0];
-   assign sp = !FOpCtrlE[1]&FOpCtrlE[0];
-   assign hp = FOpCtrlE[1]&!FOpCtrlE[0];
-
-   // Test if A or B is NaN.
-   assign ANaN = (A[62]&A[61]&A[60]&A[59]&A[58]) & 
-		 ((sp&A[57]&A[56]&A[55]&(A[54]|A[53])) | 
-		 (dp&A[57]&A[56]&A[55]&A[54]&A[53]&A[52]&(A[51]|A[50])) |
-		 (hp&(A[57]|A[56])));
-
-   assign BNaN = (B[62]&B[61]&B[60]&B[59]&B[58]) & 
-		 ((sp&B[57]&B[56]&B[55]&(B[54]|B[53])) | 
-		 (dp&B[57]&B[56]&B[55]&B[54]&B[53]&B[52]&(B[51]|B[50])) |
-		 (hp&(B[57]|B[56])));
-
-   // Test if A is +0 or -0 when viewed as a floating point number (i.e,
-   // the 63 least siginficant bits of A are zero). 
-   // Depending on how this synthesizes, it may work better to replace
-   // this with assign Azero = ~(A[62] | A[61] | ... | A[0])
-   assign Azero = (A[62:0] == 63'h0);
-   assign Bzero = (B[62:0] == 63'h0);
-
-endmodule // exception_cmp
-//
-// File name : fpcomp.v
-// Title     : Floating-Point Comparator
-// project   : FPU
-// Library   : fpcomp
-// Author(s) : James E. Stine
-// Purpose   : definition of main unit to floating-point comparator
-// notes :   
-//
-// Copyright Oklahoma State University
-//
-// Floating Point Comparator (Algorithm)
-//
-// 1.) Performs sign-extension if the inputs are 32-bit integers.
-// 2.) Perform a magnitude comparison on the lower 63 bits of the inputs
-// 3.) Check for special cases (+0=-0, unordered, and infinite values) 
-//     and correct for sign bits
-//
-// This module takes 64-bits inputs op1 and op2, VSS, and VDD
-// signals, and a 2-bit signal FOpCtrlE that indicates the type of 
-// operands being compared as indicated below.
-//	FOpCtrlE	Description
-//	 00	double precision numbers
-//	 01	single precision numbers
-//	 10	half precision numbers
-//	 11	(unused)
-//
-// The comparator produces a 2-bit signal FCC, which
-// indicates the result of the comparison:
-//
-//     fcc 	decscription
-//      00	A = B	
-//      01	A < B	
-//      10	A > B	
-//      11	A and B	are unordered (i.e., A or B is NaN)
-//
-// It also produces an invalid operation flag, which is one
-// if either of the input operands is a signaling NaN per 754
-
-
-/*module magcompare2b (LT, GT, A, B);
-
-   input logic [1:0] A;
-   input logic [1:0] B;
-   
-   output logic     LT;
-   output logic     GT;
-
-   // Determine if A < B  using a minimized sum-of-products expression
-   assign LT = ~A[1]&B[1] | ~A[1]&~A[0]&B[0] | ~A[0]&B[1]&B[0];
-   // Determine if A > B  using a minimized sum-of-products expression
-   assign GT = A[1]&~B[1] | A[1]&A[0]&~B[0] | A[0]&~B[1]&~B[0];
-
-endmodule*/ // magcompare2b
-
-// 2-bit magnitude comparator
-// This module compares two 2-bit values A and B. LT is '1' if A < B 
-// and GT is '1'if A > B. LT and GT are both '0' if A = B.  However,
-// this version actually incorporates don't cares into the equation to
-// simplify the optimization
-
-// module magcompare2c (LT, GT, A, B);
-
-//    input logic [1:0] A;
-//    input logic [1:0] B;
-   
-//    output logic      LT;
-//    output logic      GT;
-
-//    assign LT = B[1] | (!A[1]&B[0]);
-//    assign GT = A[1] | (!B[1]&A[0]);
-
-// endmodule // magcompare2b
-
 // This module compares two 64-bit values A and B. LT is '1' if A < B 
 // and EQ is '1'if A = B. LT and GT are both '0' if A > B.
 // This structure was modified so
@ -385,6 +246,8 @@ endmodule // magcompare64b
 module exception_cmp_2 (
   input logic [63:0] A,
   input logic [63:0] B,
+   input logic [63:0] FSrcXE,
+   input logic [63:0] FSrcYE,
   input logic 	      FmtE,
   input logic 	      LT_mag,
   input logic 	      EQ_mag,
@ -453,8 +316,8 @@ module exception_cmp_2 (

   always_comb begin
      case (FOpCtrlE[2:0])
-         3'b111: CmpResE = LT ? A : B;//min 
-         3'b101: CmpResE = GT ? A : B;//max
+         3'b111: CmpResE = LT ? FSrcXE : FSrcYE;//min 
+         3'b101: CmpResE = GT ? FSrcXE : FSrcYE;//max
         3'b010: CmpResE = {63'b0, EQ};//equal
         3'b001: CmpResE = {63'b0, LT};//less than
         3'b011: CmpResE = {63'b0, LT|EQ};//less than or equal
--- a/wally-pipelined/src/fpu/fctrl.sv
+++ b/wally-pipelined/src/fpu/fctrl.sv
@ -6,7 +6,7 @@ module fctrl (
  input  logic [2:0] Funct3D,
  input  logic [2:0] FRM_REGW,
  output logic       IllegalFPUInstrD,
-  output logic       FWriteEnD,
+  output logic       FRegWriteD,
  output logic       FDivStartD,
  output logic [2:0] FResultSelD,
  output logic [3:0] FOpCtrlD,
@ -21,7 +21,7 @@ module fctrl (
  // FPU Instruction Decoder
  always_comb
    case(OpD)
-    // FWriteEn_FWriteInt_FResultSel_FOpCtrl_FResSel_FIntResSel_FDivStart_IllegalFPUInstr
+    // FRegWrite_FWriteInt_FResultSel_FOpCtrl_FResSel_FIntResSel_FDivStart_IllegalFPUInstr
      7'b0000111: case(Funct3D)
                    3'b010:  ControlsD = `FCTRLW'b1_0_000_0000_00_00_0_0; // flw
                    3'b011:  ControlsD = `FCTRLW'b1_0_000_0001_00_00_0_0; // fld
@ -64,44 +64,44 @@ module fctrl (
                                else if (Funct3D[1:0] == 2'b00) ControlsD = `FCTRLW'b0_1_100_0100_00_01_0_0; // fmv.x.w
                                else if (Funct3D[1:0] == 2'b01) ControlsD = `FCTRLW'b0_1_100_0101_00_01_0_0; // fmv.x.d
                                else                            ControlsD = `FCTRLW'b0_0_000_0000_00_00_0_1; // non-implemented instruction
-                    7'b1100000: case(Rs2D[1:0])
-                                  2'b00:    ControlsD = `FCTRLW'b0_1_100_0001_00_00_0_0; // fcvt.s.w
-                                  2'b01:    ControlsD = `FCTRLW'b0_1_100_0101_00_00_0_0; // fcvt.s.wu
-                                  2'b10:    ControlsD = `FCTRLW'b0_1_100_1001_00_00_0_0; // fcvt.s.l
-                                  2'b11:    ControlsD = `FCTRLW'b0_1_100_1101_00_00_0_0; // fcvt.s.lu
+                    7'b1101000: case(Rs2D[1:0])
+                                  2'b00:    ControlsD = `FCTRLW'b1_0_100_0001_11_00_0_0; // fcvt.s.w
+                                  2'b01:    ControlsD = `FCTRLW'b1_0_100_0101_11_00_0_0; // fcvt.s.wu
+                                  2'b10:    ControlsD = `FCTRLW'b1_0_100_1001_11_00_0_0; // fcvt.s.l
+                                  2'b11:    ControlsD = `FCTRLW'b1_0_100_1101_11_00_0_0; // fcvt.s.lu
                                  default: ControlsD = `FCTRLW'b0_0_000_0000_00_00_0_1; // non-implemented instruction
                                endcase
-                    7'b1101000: case(Rs2D[1:0])
-                                  2'b00:    ControlsD = `FCTRLW'b1_1_100_0010_00_00_0_0; // fcvt.w.s
-                                  2'b01:    ControlsD = `FCTRLW'b1_1_100_0110_00_00_0_0; // fcvt.wu.s
-                                  2'b10:    ControlsD = `FCTRLW'b1_1_100_1010_00_00_0_0; // fcvt.l.s
-                                  2'b11:    ControlsD = `FCTRLW'b1_1_100_1110_00_00_0_0; // fcvt.lu.s
+                    7'b1100000: case(Rs2D[1:0])
+                                  2'b00:    ControlsD = `FCTRLW'b0_1_100_0010_11_11_0_0; // fcvt.w.s
+                                  2'b01:    ControlsD = `FCTRLW'b0_1_100_0110_11_11_0_0; // fcvt.wu.s
+                                  2'b10:    ControlsD = `FCTRLW'b0_1_100_1010_11_11_0_0; // fcvt.l.s
+                                  2'b11:    ControlsD = `FCTRLW'b0_1_100_1110_11_11_0_0; // fcvt.lu.s
                                  default: ControlsD = `FCTRLW'b0_0_000_0000_00_00_0_1; // non-implemented instruction
                                endcase
                    7'b1111000: ControlsD = `FCTRLW'b1_0_100_0000_00_00_0_0; // fmv.w.x
-                    7'b0100000: ControlsD = `FCTRLW'b1_0_010_0000_00_00_0_0; // fcvt.s.d
-                    7'b1100001: case(Rs2D[1:0])
-                                  2'b00:    ControlsD = `FCTRLW'b0_1_100_0001_00_00_0_0; // fcvt.d.w
-                                  2'b01:    ControlsD = `FCTRLW'b0_1_100_0101_00_00_0_0; // fcvt.d.wu
-                                  2'b10:    ControlsD = `FCTRLW'b0_1_100_1001_00_00_0_0; // fcvt.d.l
-                                  2'b11:    ControlsD = `FCTRLW'b0_1_100_1101_00_00_0_0; // fcvt.d.lu
+                    7'b0100000: ControlsD = `FCTRLW'b1_0_010_0111_00_00_0_0; // fcvt.s.d
+                    7'b1101001: case(Rs2D[1:0])
+                                  2'b00:    ControlsD = `FCTRLW'b1_0_100_0001_11_00_0_0; // fcvt.d.w
+                                  2'b01:    ControlsD = `FCTRLW'b1_0_100_0101_11_00_0_0; // fcvt.d.wu
+                                  2'b10:    ControlsD = `FCTRLW'b1_0_100_1001_11_00_0_0; // fcvt.d.l
+                                  2'b11:    ControlsD = `FCTRLW'b1_0_100_1101_11_00_0_0; // fcvt.d.lu
                                  default: ControlsD = `FCTRLW'b0_0_000_0000_00_00_0_1; // non-implemented instruction
                                endcase
-                    7'b1101001: case(Rs2D[1:0])
-                                  2'b00:    ControlsD = `FCTRLW'b1_0_100_0010_00_00_0_0; // fcvt.w.d
-                                  2'b01:    ControlsD = `FCTRLW'b1_0_100_0110_00_00_0_0; // fcvt.wu.d
-                                  2'b10:    ControlsD = `FCTRLW'b1_0_100_1010_00_00_0_0; // fcvt.l.d
-                                  2'b11:    ControlsD = `FCTRLW'b1_0_100_1110_00_00_0_0; // fcvt.lu.d
+                    7'b1100001: case(Rs2D[1:0])
+                                  2'b00:    ControlsD = `FCTRLW'b0_1_100_0010_11_11_0_0; // fcvt.w.d
+                                  2'b01:    ControlsD = `FCTRLW'b0_1_100_0110_11_11_0_0; // fcvt.wu.d
+                                  2'b10:    ControlsD = `FCTRLW'b0_1_100_1010_11_11_0_0; // fcvt.l.d
+                                  2'b11:    ControlsD = `FCTRLW'b0_1_100_1110_11_11_0_0; // fcvt.lu.d
                                  default: ControlsD = `FCTRLW'b0_0_000_0000_00_00_0_1; // non-implemented instruction
                                endcase
                    7'b1111001: ControlsD = `FCTRLW'b1_0_100_0001_00_00_0_0; // fmv.d.x
-                    7'b0100001: ControlsD = `FCTRLW'b1_0_100_0000_00_00_0_0; // fcvt.d.s
+                    7'b0100001: ControlsD = `FCTRLW'b1_0_010_0111_00_00_0_0; // fcvt.d.s
                    default:    ControlsD = `FCTRLW'b0_0_000_0000_00_00_0_1; // non-implemented instruction
                  endcase
      default:      ControlsD = `FCTRLW'b0_0_000_0000_00_00_0_1; // non-implemented instruction
    endcase
  // unswizzle control bits
-  assign {FWriteEnD, FWriteIntD, FResultSelD, FOpCtrlD, FResSelD, FIntResSelD, FDivStartD, IllegalFPUInstrD} = ControlsD;
+  assign {FRegWriteD, FWriteIntD, FResultSelD, FOpCtrlD, FResSelD, FIntResSelD, FDivStartD, IllegalFPUInstrD} = ControlsD;
  
  // if dynamic rounding, choose FRM_REGW
  assign FrmD = &Funct3D ? FRM_REGW : Funct3D;
@ -109,7 +109,7 @@ module fctrl (
  // Precision
  //  0-single
  //  1-double
-  assign FmtD = FResultSelD == 3'b000 ? Funct3D[0] : Funct7D[0];
+  assign FmtD = FResultSelD == 3'b000 ? Funct3D[0] : OpD[6:1] == 6'b010000 ? ~Funct7D[0] : Funct7D[0];
  // div/sqrt
      //  fdiv  = ???0
      //  fsqrt = ???1
--- a/wally-pipelined/src/fpu/fcvt.sv
+++ b/wally-pipelined/src/fpu/fcvt.sv
@ -0,0 +1,190 @@
+
+`include "wally-config.vh"
+module fcvt (                      // conversion unit: FOpCtrlE[0] selects the floating point result, otherwise the integer result is output
+	input logic        XSgnE,
+    input logic [10:0] XExpE,
+    input logic [51:0] XFracE,
+    input logic XAssumed1E,         // prepended to XFracE to form the mantissa that is shifted
+    input logic XZeroE,
+    input logic XNaNE,
+    input logic XInfE,
+    input logic XDenormE,           // added in when unbiasing the exponent
+    input logic [10:0] BiasE,       // subtracted from XExpE (to int) and added to the new exponent (to FP)
+    input logic [`XLEN-1:0] SrcAE,  // integer input
+    input logic [3:0] FOpCtrlE,     // chooses which instruction is done (full list below)
+    input logic [2:0] FrmE,         // rounding mode 000 = round to nearest, ties to even   001 = round towards zero  010 = round down  011 = round up  100 = round to nearest, ties to max magnitude
+    input logic FmtE,               // precision 1 = double 0 = single
+    output logic [63:0] CvtResE,    // convert final result
+    output logic [4:0] CvtFlgE);     // convert flags {invalid, divide by zero, overflow, underflow, inexact}
+
+    logic               ResSgn; // FP result's sign
+    logic [10:0]        ResExp,TmpExp; // FP result's exponent
+    logic [51:0]        ResFrac;    // FP result's fraction
+    logic [5:0]         LZResP;     // lz output
+    logic [7:0]         Bits;       // how many bits are in the integer result
+    logic [7:0]         SubBits;    // subtract these bits from the exponent (FP result)
+    logic [64+51:0]  ShiftedManTmp; // Shifted mantissa
+    logic [64+51:0]  ShiftVal;       // value being shifted (to int - XMan, to FP - |integer input|)
+    logic [64+1:0]   ShiftedMan;     // shifted mantissa truncated
+    logic [64:0]	    RoundedTmp;     // full size rounded result - in case of overflow
+    logic [63:0]	    Rounded;        // rounded result
+    logic [12:0]        ExpVal;         // unbiased X exponent
+    logic [12:0]        ShiftCnt;       // how much is the mantissa shifted
+	logic [64-1:0]   IntIn;          // trimmed integer input
+    logic [64-1:0]   PosInt;         // absolute value of the integer input
+    logic [63:0]        CvtIntRes;      // integer result from the fp -> int instructions
+    logic [63:0]        CvtFPRes;       // floating point result from the int -> fp instructions
+    logic               Of, Uf;         // did the integer result underflow or overflow
+    logic               Guard, Round, LSB, Sticky;  // bits used to determine rounding
+    logic               Plus1,CalcPlus1;    // do you add one for rounding
+    logic               SgnRes;             // sign of the floating point result
+    logic               Res64, In64;        // is the result or input 64 bits
+    logic               RoundMSB;           // most significant bit of the fraction
+    logic               RoundSgn;           // sign of the rounded result
+
+    // FOpCtrlE:
+      //  fcvt.w.s  = 0010
+      //  fcvt.wu.s = 0110
+      //  fcvt.s.w  = 0001
+      //  fcvt.s.wu = 0101
+      //  fcvt.l.s  = 1010
+      //  fcvt.lu.s = 1110
+      //  fcvt.s.l  = 1001
+      //  fcvt.s.lu = 1101
+      //  fcvt.w.d  = 0010 
+      //  fcvt.wu.d = 0110
+      //  fcvt.d.w  = 0001
+      //  fcvt.d.wu = 0101
+      //  fcvt.l.d  = 1010
+      //  fcvt.lu.d = 1110
+      //  fcvt.d.l  = 1001
+      //  fcvt.d.lu = 1101
+      //  {long, unsigned, to int, from int}
+   
+    // calculate signals based on the input's and output's size
+    // assign Bias = FmtE ? 12'h3ff : 12'h7f;
+    assign Res64 = ((FOpCtrlE==4'b1010 || FOpCtrlE==4'b1110) | (FmtE&(FOpCtrlE==4'b0001 | FOpCtrlE==4'b0101 | FOpCtrlE==4'b0000 | FOpCtrlE==4'b1001 | FOpCtrlE==4'b1101)));
+    assign In64 = ((FOpCtrlE==4'b1001 || FOpCtrlE==4'b1101) | (FmtE&(FOpCtrlE==4'b0010 | FOpCtrlE==4'b0110 | FOpCtrlE==4'b1010 | FOpCtrlE==4'b1110) | (FOpCtrlE==4'b1101 & ~FmtE)));
+    assign SubBits = In64 ? 8'd64 : 8'd32;
+    assign Bits = Res64 ? 8'd64 : 8'd32;
+
+    // calculate the unbiased exponent
+    assign ExpVal = XExpE - BiasE + XDenormE;
+
+////////////////////////////////////////////////////////
+
+    // position the input in the most significant bits
+    assign IntIn = FOpCtrlE[3] ? {SrcAE, {64-`XLEN{1'b0}}} : {SrcAE[31:0], 32'b0};
+    // make the integer positive
+    assign PosInt = IntIn[64-1]&~FOpCtrlE[2] ? -IntIn : IntIn;
+    // determine the integer's sign
+    assign ResSgn = ~FOpCtrlE[2] ? IntIn[64-1] : 1'b0;
+    
+    // generate
+    //     if(`XLEN == 64) 
+    //         lz64 lz(LZResP, LZResV, PosInt);
+    //     else if(`XLEN == 32) begin
+    //         assign LZResP[5] = 1'b0;
+    //         lz32 lz(LZResP[4:0], LZResV, PosInt);
+    //     end 
+    // endgenerate
+
+	// Leading one detector
+	logic [8:0]	i;
+	always_comb begin
+			i = 0;
+			while (~PosInt[64-1-i] && i < `XLEN) i = i+1;  // search for leading one 
+			LZResP = i+1;    // compute shift count
+	end
+
+    // if no one was found set to zero otherwise calculate the exponent
+    assign TmpExp = i==`XLEN ? 0 : BiasE + SubBits - LZResP;
+
+
+
+
+////////////////////////////////////////////
+
+
+    // select the shift value and amount based on operation (to fp or int)
+    assign ShiftCnt = FOpCtrlE[1] ? ExpVal : LZResP;
+    assign ShiftVal = FOpCtrlE[1] ? {{64-2{1'b0}}, XAssumed1E, XFracE} : {PosInt, 52'b0};
+
+	// if shift = -1 then shift one bit right for guard bit (right shifting twice never rounds)
+	// if the shift is negative add a bit for sticky bit calculation
+	// otherwise shift left
+    assign ShiftedManTmp = &ShiftCnt ? {{64-1{1'b0}}, XAssumed1E, XFracE[51:1]} : ShiftCnt[12] ? {{64+51{1'b0}}, ~XZeroE} : ShiftVal << ShiftCnt;
+
+    // truncate the shifted mantissa
+    assign ShiftedMan = ShiftedManTmp[64+51:50];
+
+    // calculate sticky bit 
+    //  - take into account the possible right shift from before
+    //  - the sticky bit calculation covers three different sizes depending on the operation
+    assign Sticky = |ShiftedManTmp[49:0] | &ShiftCnt&XFracE[0] | (FOpCtrlE[0]&|ShiftedManTmp[62:50]) | (FOpCtrlE[0]&~FmtE&|ShiftedManTmp[91:63]);
+
+    
+    // determine guard, round, and least significant bit of the result
+    assign Guard = FOpCtrlE[1] ? ShiftedMan[1] : FmtE ? ShiftedMan[13] : ShiftedMan[42];
+    assign Round = FOpCtrlE[1] ? ShiftedMan[0] : FmtE ? ShiftedMan[12] : ShiftedMan[41];
+    assign LSB = FOpCtrlE[1] ? ShiftedMan[2] : FmtE ? ShiftedMan[14] : ShiftedMan[43];
+
+    always_comb begin
+        // Determine if you add 1
+        case (FrmE)
+            3'b000: CalcPlus1 = Guard & (Round | Sticky | (~Round&~Sticky&LSB));//round to nearest even
+            3'b001: CalcPlus1 = 0;//round to zero
+            3'b010: CalcPlus1 = (XSgnE&FOpCtrlE[1]) | (ResSgn&FOpCtrlE[0]);//round down
+            3'b011: CalcPlus1 = (~XSgnE&FOpCtrlE[1]) | (~ResSgn&FOpCtrlE[0]);//round up
+            3'b100: CalcPlus1 = Guard & (Round | Sticky | (~Round&~Sticky));//round to nearest max magnitude
+            default: CalcPlus1 = 1'bx;
+        endcase
+    end
+
+    // don't round if the result is exact
+    assign Plus1 = CalcPlus1 & (Guard|Round|Sticky)&~(XZeroE&FOpCtrlE[1]);
+
+    // round the shifted mantissa
+    assign RoundedTmp = ShiftedMan[64+1:2] + Plus1;
+    assign {ResExp, ResFrac} = FmtE ? {TmpExp, ShiftedMan[64+1:14]} + Plus1 :  {{TmpExp, ShiftedMan[64+1:43]} + Plus1, 29'b0} ;
+
+    // fit the rounded result into the appropriate size and take the 2's complement if needed
+     assign Rounded = Res64 ? XSgnE&FOpCtrlE[1] ? -RoundedTmp[63:0] : RoundedTmp[63:0] : 
+			      XSgnE ? {{32{1'b1}}, -RoundedTmp[31:0]} : {32'b0, RoundedTmp[31:0]};
+
+    // extract the MSB and Sign for later use (will be used to determine underflow and overflow)
+     assign RoundMSB = Res64 ? RoundedTmp[64] : RoundedTmp[32];
+     assign RoundSgn = Res64 ? Rounded[63] : Rounded[31];
+
+
+    // check if the result overflows
+    assign Of = (~XSgnE&($signed(ShiftCnt) >= $signed(Bits))) | (~XSgnE&RoundSgn&~FOpCtrlE[2]) | (RoundMSB&(ShiftCnt==(Bits-1))) | (~XSgnE&XInfE) | XNaNE;
+
+    // check if the result underflows (this calculation changes if the result is signed or unsigned)
+    assign Uf = FOpCtrlE[2] ? XSgnE&~XZeroE | (XSgnE&XInfE) | (XSgnE&~XZeroE&(~ShiftCnt[12]|CalcPlus1)) | (ShiftCnt[12]&Plus1) : (XSgnE&XInfE) | (XSgnE&($signed(ShiftCnt) >= $signed(Bits))) | (XSgnE&~RoundSgn&~ShiftCnt[12]);    // assign CvtIntRes =  (XSgnE | ShiftCnt[12]) ? {64{1'b0}}  : (ShiftCnt >= 64) ? {64{1'b1}} : Rounded;
+    
+    // calculate the result's sign
+    assign SgnRes = ~FOpCtrlE[3] & FOpCtrlE[1];
+
+    // select the integer result
+    assign CvtIntRes = Of ? FOpCtrlE[2] ? {64{1'b1}} : SgnRes ? {33'b0, {31{1'b1}}}: {1'b0, {63{1'b1}}} : 
+                    Uf ? FOpCtrlE[2] ? 64'b0 : SgnRes ? {32'b0, 1'b1, 31'b0} : {1'b1, 63'b0} :
+		            Rounded[64-1:0];
+
+    // select the floating point result            
+    assign CvtFPRes = FmtE ? {ResSgn, ResExp, ResFrac} : {{32{1'b1}}, ResSgn, ResExp[7:0], ResFrac[51:29]};
+
+    // select the result
+    assign CvtResE = FOpCtrlE[0] ? CvtFPRes : CvtIntRes;
+
+    // calculate the flags
+    //      - to int only sets the invalid flag
+    //      - from int only sets the inexact flag
+    assign CvtFlgE = {(Of | Uf)&FOpCtrlE[1], 3'b0, (Guard|Round|Sticky)&FOpCtrlE[0]};
+
+
+
+
+endmodule // fcvt
+
+
--- a/wally-pipelined/src/fpu/fhazard.sv
+++ b/wally-pipelined/src/fpu/fhazard.sv
@ -27,40 +27,40 @@

 module fhazard(
    input logic [4:0] Adr1E, Adr2E, Adr3E,
-    input logic FWriteEnM, FWriteEnW, 
+    input logic FRegWriteM, FRegWriteW, 
 	  input logic [4:0] RdM, RdW,
    input logic [2:0] FResultSelM,
    output logic FStallD,
-    output logic [1:0] ForwardXE, ForwardYE, ForwardZE
+    output logic [1:0] FForwardXE, FForwardYE, FForwardZE
 );


  always_comb begin
    // set ReadData as default
-    ForwardXE = 2'b00; // choose FRD1E
-    ForwardYE = 2'b00; // choose FRD2E
-    ForwardZE = 2'b00; // choose FRD3E
+    FForwardXE = 2'b00; // choose FRD1E
+    FForwardYE = 2'b00; // choose FRD2E
+    FForwardZE = 2'b00; // choose FRD3E
    FStallD = 0;

-      if ((Adr1E == RdM) & FWriteEnM)
+      if ((Adr1E == RdM) & FRegWriteM)
      // if the result will be FResM
-        if(FResultSelM == 3'b100) ForwardXE = 2'b10; // choose FResM
+        if(FResultSelM == 3'b100) FForwardXE = 2'b10; // choose FResM
        else FStallD = 1;   // if the result won't be ready stall
-      else if ((Adr1E == RdW) & FWriteEnW) ForwardXE = 2'b01; // choose FPUResult64W
+      else if ((Adr1E == RdW) & FRegWriteW) FForwardXE = 2'b01; // choose FPUResult64W
    

-      if ((Adr2E == RdM) & FWriteEnM)
+      if ((Adr2E == RdM) & FRegWriteM)
      // if the result will be FResM
-        if(FResultSelM == 3'b100) ForwardYE = 2'b10; // choose FResM
+        if(FResultSelM == 3'b100) FForwardYE = 2'b10; // choose FResM
        else FStallD = 1;   // if the result won't be ready stall
-      else if ((Adr2E == RdW) & FWriteEnW) ForwardYE = 2'b01; // choose FPUResult64W
+      else if ((Adr2E == RdW) & FRegWriteW) FForwardYE = 2'b01; // choose FPUResult64W

 
-      if ((Adr3E == RdM) & FWriteEnM)
+      if ((Adr3E == RdM) & FRegWriteM)
      // if the result will be FResM
-        if(FResultSelM == 3'b100) ForwardZE = 2'b10; // choose FResM
+        if(FResultSelM == 3'b100) FForwardZE = 2'b10; // choose FResM
        else FStallD = 1;   // if the result won't be ready stall
-      else if ((Adr3E == RdW) & FWriteEnW) ForwardZE = 2'b01; // choose FPUResult64W
+      else if ((Adr3E == RdW) & FRegWriteW) FForwardZE = 2'b01; // choose FPUResult64W

  end 

--- a/wally-pipelined/src/fpu/fma2.sv
+++ b/wally-pipelined/src/fpu/fma2.sv
@ -1,10 +1,177 @@
+module fma(                // wrapper: fma1 outputs are registered (flopenrc) and fed to fma2, which produces FMAResM/FMAFlgM
+    input logic             clk,
+    input logic             reset,
+    input logic             FlushM,     // passed as the clear input of the fma1 -> fma2 registers
+    input logic             StallM,     // the fma1 -> fma2 registers are enabled with ~StallM
+    input logic             FmtE, FmtM,       // precision 1 = double 0 = single
+    input logic  [2:0]      FOpCtrlM, FOpCtrlE,   // 000 = fmadd (X*Y)+Z,  001 = fmsub (X*Y)-Z,  010 = fnmsub -(X*Y)+Z,  011 = fnmadd -(X*Y)-Z,  100 = fmul (X*Y)
+    input logic  [2:0]      FrmM,       // rounding mode 000 = round to nearest, ties to even   001 = round towards zero  010 = round down  011 = round up  100 = round to nearest, ties to max magnitude
+    input logic        XSgnE, YSgnE, ZSgnE,
+    input logic [10:0] XExpE, YExpE, ZExpE,
+    input logic [51:0] XFracE, YFracE, ZFracE,
+    input logic        XSgnM, YSgnM, ZSgnM,
+    input logic [10:0] XExpM, YExpM, ZExpM,
+    input logic [51:0] XFracM, YFracM, ZFracM,
+    input logic        XAssumed1E, YAssumed1E, ZAssumed1E,
+    input logic XDenormE, YDenormE, ZDenormE,
+    input logic XZeroE, YZeroE, ZZeroE,
+    input logic XNaNM, YNaNM, ZNaNM,
+    input logic XSNaNM, YSNaNM, ZSNaNM,
+    input logic XZeroM, YZeroM, ZZeroM,
+    input logic XInfM, YInfM, ZInfM,
+    input logic [10:0] BiasE,
+	output logic [63:0]		FMAResM,    // result driven by fma2
+	output logic [4:0]		FMAFlgM);   // flags driven by fma2
+	
+
+    // fma1 outputs (E) and their registered copies (M) consumed by fma2
+    logic [105:0]	ProdManE, ProdManM; 
+    logic [161:0]	AlignedAddendE, AlignedAddendM;                       
+    logic [12:0]	ProdExpE, ProdExpM;
+    logic 			AddendStickyE, AddendStickyM;
+    logic 			KillProdE, KillProdM;
+    
+    fma1 fma1 (.XExpE, .YExpE, .ZExpE, .XFracE, .YFracE, .ZFracE, 
+                .BiasE, .XAssumed1E, .YAssumed1E, .ZAssumed1E, .XDenormE, .YDenormE, .ZDenormE,  .XZeroE, .YZeroE, .ZZeroE,
+                .FOpCtrlE, .FmtE, .ProdManE, .AlignedAddendE,
+                .ProdExpE, .AddendStickyE, .KillProdE); 
+                
+    flopenrc #(106) EMRegFma1(clk, reset, FlushM, ~StallM, ProdManE, ProdManM); 
+    flopenrc #(162) EMRegFma2(clk, reset, FlushM, ~StallM, AlignedAddendE, AlignedAddendM); 
+    flopenrc #(13) EMRegFma3(clk, reset, FlushM, ~StallM, ProdExpE, ProdExpM);  
+    flopenrc #(2) EMRegFma4(clk, reset, FlushM, ~StallM, 
+                            {AddendStickyE, KillProdE},
+                            {AddendStickyM, KillProdM});
+
+    fma2 fma2(.XSgnM, .YSgnM, .ZSgnM, .XExpM, .YExpM, .ZExpM, .XFracM, .YFracM, .ZFracM, 
+            .FOpCtrlM, .FrmM, .FmtM, 
+            .ProdManM, .AlignedAddendM, .ProdExpM, .AddendStickyM, .KillProdM, 
+            .XZeroM, .YZeroM, .ZZeroM, .XInfM, .YInfM, .ZInfM, .XNaNM, .YNaNM, .ZNaNM, .XSNaNM, .YSNaNM, .ZSNaNM,
+            .FMAResM, .FMAFlgM);
+
+endmodule
+      
+
+
+module fma1(               // first FMA step: multiply the X and Y mantissas and align the Z mantissa to the product
+    // input logic        XSgnE, YSgnE, ZSgnE,
+    input logic [10:0] XExpE, YExpE, ZExpE,
+    input logic [51:0] XFracE, YFracE, ZFracE,
+    input logic        XAssumed1E, YAssumed1E, ZAssumed1E,
+    input logic        XDenormE, YDenormE, ZDenormE,
+    input logic XZeroE, YZeroE, ZZeroE,
+    input logic [10:0] BiasE,
+    input logic     [2:0]       FOpCtrlE,   // 000 = fmadd (X*Y)+Z,  001 = fmsub (X*Y)-Z,  010 = fnmsub -(X*Y)+Z,  011 = fnmadd -(X*Y)-Z,  100 = fmul (X*Y)
+    input logic                 FmtE,       // precision 1 = double 0 = single
+    output logic    [105:0]     ProdManE,   // 1.X frac * 1.Y frac
+    output logic    [161:0]     AlignedAddendE, // Z aligned for addition
+    output logic    [12:0]      ProdExpE,       // X exponent + Y exponent - bias
+    output logic                AddendStickyE,  // sticky bit that is calculated during alignment
+    output logic                KillProdE      // set the product to zero before addition if the product is too small to matter
+    );
+
+    logic [12:0]    AlignCnt;           // how far to shift the addend to align with the product
+    logic [213:0]   ZManShifted;                // output of the alignment shifter including sticky bit
+    logic [213:0]   ZManPreShifted;     // input to the alignment shifter
+    
+    
+    ///////////////////////////////////////////////////////////////////////////////
+    // Calculate the product
+    //      - When multiplying two fp numbers, add the exponents
+    //      - Subtract the bias (XExp + YExp has two biases, one from each exponent)
+    //      - Denormal numbers have an exponent value of 1, however they are
+    //        represented with an exponent of 0. add one if there is a denormal number
+    ///////////////////////////////////////////////////////////////////////////////
+   
+    // verilator lint_off WIDTH
+    assign ProdExpE = (XZeroE|YZeroE) ? 13'b0 :
+                 XExpE + YExpE - BiasE + XDenormE + YDenormE;
+
+    // Calculate the product's mantissa
+    //      - Add the assumed one. If the number is denormalized or zero, it does not have an assumed one.
+    assign ProdManE =  {XAssumed1E, XFracE} * {YAssumed1E, YFracE};
+
+
+
+
+
+
+
+
+   
+    ///////////////////////////////////////////////////////////////////////////////
+    // Alignment shifter
+    ///////////////////////////////////////////////////////////////////////////////
+
+    // determine the shift count for alignment
+    //      - negative means Z is larger, so shift Z left
+    //      - positive means the product is larger, so shift Z right
+    //      - Denormal numbers have an exponent value of 1, however they are
+    //        represented with an exponent of 0. add one to the exponent if it is a denormal number
+    assign AlignCnt = ProdExpE - ZExpE - ZDenormE;
+    // verilator lint_on WIDTH
+
+
+    // Default Addition without shifting
+    //          |   55'b0    |  106'b(product)  | 2'b0 |
+    //                       |1'b0| addend  |
+
+    // the 1'b0 before the addend is because the product's mantissa has two bits before the binary point (xx.xxxxxxxxxx...)
+    assign ZManPreShifted = {55'b0, {ZAssumed1E, ZFracE}, 106'b0};
+    always_comb
+        begin
+           
+        // If the product is too small to affect the sum, kill the product
+
+        //          |   54'b0    |  106'b(product)  | 2'b0 |
+        //  | addend  |
+        if ($signed(AlignCnt) <= $signed(-13'd56)) begin
+            KillProdE = 1;
+            ZManShifted = ZManPreShifted;//{107'b0, {~ZAssumed1E, ZFrac}, 54'b0};
+            AddendStickyE = ~(XZeroE|YZeroE);
+
+        // If the Addend is shifted left (negative AlignCnt)
+
+        //          |   54'b0    |  106'b(product)  | 2'b0 |
+        //                  | addend  |
+        end else if($signed(AlignCnt) <= $signed(13'd0))  begin
+            KillProdE = 0;
+            ZManShifted = ZManPreShifted << -AlignCnt;
+            AddendStickyE = |(ZManShifted[51:0]);
+
+        // If the Addend is shifted right (positive AlignCnt)
+
+        //          |   54'b0    |  106'b(product)  | 2'b0 |
+        //                                  | addend  |
+        end else if ($signed(AlignCnt)<=$signed(13'd106))  begin
+            KillProdE = 0;
+            ZManShifted = ZManPreShifted >> AlignCnt;
+            AddendStickyE = |(ZManShifted[51:0]);
+
+        // If the addend is too small to affect the addition        
+        //      - The addend has to shift two past the end of the addend to be considered too small
+        //      - The 2 extra bits are needed for rounding
+
+        //          |   54'b0    |  106'b(product)  | 2'b0 |
+        //                                                      | addend  |
+        end else begin
+            KillProdE = 0;
+            ZManShifted = 0;
+            AddendStickyE = ~ZZeroE;
+
+        end
+    end
+
+   
+    // the low 52 bits of the shifter output are only used for the sticky bit; the upper 162 bits are the aligned addend
+    assign AlignedAddendE = ZManShifted[213:52];
+
+endmodule


 module fma2(
- 
-    input logic     [63:0]      X,  // X
-    input logic     [63:0]      Y,  // Y
-    input logic     [63:0]      Z,  // Z
+    
+    input logic        XSgnM, YSgnM, ZSgnM,
+    input logic [10:0] XExpM, YExpM, ZExpM,
+    input logic [51:0] XFracM, YFracM, ZFracM,
    input logic     [2:0]       FrmM,       // rounding mode 000 = rount to nearest, ties to even   001 = round twords zero  010 = round down  011 = round up  100 = round to nearest, ties to max magnitude
    input logic     [2:0]       FOpCtrlM,   // 000 = fmadd (X*Y)+Z,  001 = fmsub (X*Y)-Z,  010 = fnmsub -(X*Y)+Z,  011 = fnmadd -(X*Y)-Z,  100 = fmul (X*Y)
    input logic                 FmtM,       // precision 1 = double 0 = single
@ -16,6 +183,7 @@ module fma2(
    input logic                 XZeroM, YZeroM, ZZeroM, // inputs are zero
    input logic                 XInfM, YInfM, ZInfM,    // inputs are infinity
    input logic                 XNaNM, YNaNM, ZNaNM,    // inputs are NaN
+    input logic                 XSNaNM, YSNaNM, ZSNaNM,    // inputs are signaling NaNs
    output logic    [63:0]      FMAResM,     // FMA final result
    output logic    [4:0]       FMAFlgM);     // FMA flags {invalid, divide by zero, overflow, underflow, inexact}
   
@ -24,8 +192,6 @@ module fma2(
    logic [51:0]    ResultFrac; // Result fraction
    logic [10:0]    ResultExp;  // Result exponent
    logic           ResultSgn;  // Result sign
-    logic [10:0]    ZExp;   // input exponent
-    logic           XSgn, YSgn, ZSgn;   // input sign
    logic           PSgn;       // product sign
    logic [105:0]   ProdMan2;   // product being added
    logic [162:0]   AlignedAddend2; // possibly inverted aligned Z
@ -61,28 +227,10 @@ module fma2(
    logic [63:0] XNaNResult, YNaNResult, ZNaNResult, InvalidResult, OverflowResult, KillProdResult, UnderflowResult; // possible results

   
-    ///////////////////////////////////////////////////////////////////////////////
-    // Select input fields
-    // The following logic duplicates fma1 because it's cheaper to recompute than provide registers
-    ///////////////////////////////////////////////////////////////////////////////
-
-    // Set addend to zero if FMUL instruction
-    assign Addend = FOpCtrlM[2] ? 64'b0 : Z;
-
-    // split inputs into the sign bit, and exponent to handle single or double precision
-    //      - single precision is in the top half of the inputs
-    assign XSgn = X[63];
-    assign YSgn = Y[63];
-    assign ZSgn = Addend[63]^FOpCtrlM[0]; //Negate Z if subtraction
-
-    assign ZExp = FmtM ? Addend[62:52] : {3'b0, Addend[62:55]};
-
-
-
-
+    
    // Calculate the product's sign
    //      Negate product's sign if FNMADD or FNMSUB
-    assign PSgn = XSgn ^ YSgn ^ FOpCtrlM[1];
+    assign PSgn = XSgnM ^ YSgnM ^ FOpCtrlM[1];



@ -93,7 +241,7 @@ module fma2(
    // Negate Z  when doing one of the following opperations:
    //      -prod +  Z
    //       prod -  Z
-    assign InvZ = ZSgn ^ PSgn;
+    assign InvZ = ZSgnM ^ PSgn;

    // Choose an inverted or non-inverted addend - the one is added later
    assign AlignedAddend2 = InvZ ? ~{1'b0, AlignedAddendM} : {1'b0, AlignedAddendM};
@ -148,7 +296,7 @@ module fma2(
    assign FracLen = FmtM ? 13'd52 : 13'd23;

    // Determine if the result is denormal
-    assign SumExpTmp = KillProdM ? {2'b0, ZExp} : ProdExpM + -({4'b0, NormCnt} - 13'd56);
+    assign SumExpTmp = KillProdM ? {2'b0, ZExpM} : ProdExpM + -({4'b0, NormCnt} - 13'd56);
    assign ResultDenorm = $signed(SumExpTmp)<=0 & ($signed(SumExpTmp)>=$signed(-FracLen)) & ~SumZero;

    // Determine the shift needed for denormal results
@ -273,13 +421,13 @@ module fma2(
    // Determine the sign if the sum is zero
    //      if cancelation then 0 unless round to -infinity
    //      otherwise psign
-    assign ZeroSgn = (PSgn^ZSgn)&~Underflow ? FrmM == 3'b010 : PSgn;
+    assign ZeroSgn = (PSgn^ZSgnM)&~Underflow ? FrmM == 3'b010 : PSgn;

    // is the result negitive
    //  if p - z is the Sum negitive
    //  if -p + z is the Sum positive
    //  if -p - z then the Sum is negitive
-    assign ResultSgnTmp = InvZ&(ZSgn)&NegSum | InvZ&PSgn&~NegSum | ((ZSgn)&PSgn);
+    assign ResultSgnTmp = InvZ&(ZSgnM)&NegSum | InvZ&PSgn&~NegSum | ((ZSgnM)&PSgn);
    assign ResultSgn = SumZero ? ZeroSgn : ResultSgnTmp;
 

@ -297,9 +445,8 @@ module fma2(
    //   2) Inf - Inf (unless x or y is NaN)
    //   3) 0 * Inf
    assign MaxExp = FmtM ? 13'd2047 : 13'd255;
-    assign SigNaN = FmtM ? (XNaNM&~X[51]) | (YNaNM&~Y[51]) | (ZNaNM&~Addend[51]) :
-                           (XNaNM&~X[54]) | (YNaNM&~Y[54]) | (ZNaNM&~Addend[54]);
-    assign Invalid = SigNaN | ((XInfM || YInfM) & ZInfM & (PSgn ^ ZSgn) & ~XNaNM & ~YNaNM) | (XZeroM & YInfM) | (YZeroM & XInfM);  
+    assign SigNaN = XSNaNM | YSNaNM | ZSNaNM;
+    assign Invalid = SigNaN | ((XInfM || YInfM) & ZInfM & (PSgn ^ ZSgnM) & ~XNaNM & ~YNaNM) | (XZeroM & YInfM) | (YZeroM & XInfM);  
   
    // Set Overflow flag if the number is too big to be represented
    //      - Don't set the overflow flag if an overflowed result isn't outputed
@ -327,28 +474,28 @@ module fma2(
    ///////////////////////////////////////////////////////////////////////////////
    // Select the result
    ///////////////////////////////////////////////////////////////////////////////
-    assign XNaNResult = FmtM ? {XSgn, X[62:52], 1'b1,X[50:0]} : {XSgn, X[62:55], 1'b1,X[53:0]};
-    assign YNaNResult = FmtM ? {YSgn, Y[62:52], 1'b1,Y[50:0]} : {YSgn, Y[62:55], 1'b1,Y[53:0]};
-    assign ZNaNResult = FmtM ? {ZSgn, Addend[62:52], 1'b1,Addend[50:0]} : {ZSgn, Addend[62:55], 1'b1,Addend[53:0]};
+    assign XNaNResult = FmtM ? {XSgnM, XExpM, 1'b1, XFracM[50:0]} : {{32{1'b1}}, XSgnM, XExpM[7:0], 1'b1, XFracM[50:29]};
+    assign YNaNResult = FmtM ? {YSgnM, YExpM, 1'b1, YFracM[50:0]} : {{32{1'b1}}, YSgnM, YExpM[7:0], 1'b1, YFracM[50:29]};
+    assign ZNaNResult = FmtM ? {ZSgnM, ZExpM, 1'b1, ZFracM[50:0]} : {{32{1'b1}}, ZSgnM, ZExpM[7:0], 1'b1, ZFracM[50:29]};
    assign OverflowResult =  FmtM ? ((FrmM[1:0]==2'b01) | (FrmM[1:0]==2'b10&~ResultSgn) | (FrmM[1:0]==2'b11&ResultSgn)) ? {ResultSgn, 11'h7fe, {52{1'b1}}} :
                                                                                                                          {ResultSgn, 11'h7ff, 52'b0} :
-                                    ((FrmM[1:0]==2'b01) | (FrmM[1:0]==2'b10&~ResultSgn) | (FrmM[1:0]==2'b11&ResultSgn)) ? {ResultSgn, 8'hfe, {23{1'b1}}, 32'b0} :
-                                                                                                                          {ResultSgn, 8'hff, 55'b0};
-    assign InvalidResult = FmtM ? {ResultSgn, 11'h7ff, 1'b1, 51'b0} : {ResultSgn, 8'hff, 1'b1, 54'b0};
-    assign KillProdResult = FmtM ?{ResultSgn, Addend[62:0] - {62'b0, (Minus1&AddendStickyM)}} + {62'b0, (Plus1&AddendStickyM)} : {ResultSgn, Addend[62:32] - {30'b0, (Minus1&AddendStickyM)} + {30'b0, (Plus1&AddendStickyM)}, 32'b0};
-    assign UnderflowResult = FmtM ? {ResultSgn, 63'b0} + {63'b0, (CalcPlus1&(AddendStickyM|FrmM[1]))} : {{ResultSgn, 31'b0} + {31'b0, (CalcPlus1&(AddendStickyM|FrmM[1]))}, 32'b0};
+                                    ((FrmM[1:0]==2'b01) | (FrmM[1:0]==2'b10&~ResultSgn) | (FrmM[1:0]==2'b11&ResultSgn)) ? {{32{1'b1}}, ResultSgn, 8'hfe, {23{1'b1}}} :
+                                                                                                                          {{32{1'b1}}, ResultSgn, 8'hff, 23'b0};
+    assign InvalidResult = FmtM ? {ResultSgn, 11'h7ff, 1'b1, 51'b0} : {{32{1'b1}}, ResultSgn, 8'hff, 1'b1, 22'b0};
+    assign KillProdResult = FmtM ? {ResultSgn, {ZExpM, ZFracM} - {62'b0, (Minus1&AddendStickyM)}} + {62'b0, (Plus1&AddendStickyM)} : {{32{1'b1}}, ResultSgn, {ZExpM[7:0], ZFracM[51:29]} - {30'b0, (Minus1&AddendStickyM)} + {30'b0, (Plus1&AddendStickyM)}};
+    assign UnderflowResult = FmtM ? {ResultSgn, 63'b0} + {63'b0, (CalcPlus1&(AddendStickyM|FrmM[1]))} : {{32{1'b1}}, {ResultSgn, 31'b0} + {31'b0, (CalcPlus1&(AddendStickyM|FrmM[1]))}};
    assign FMAResM = XNaNM ? XNaNResult :
                        YNaNM ? YNaNResult :
                        ZNaNM ? ZNaNResult :
                        Invalid ? InvalidResult : // has to be before inf
-                        XInfM ? {PSgn, X[62:0]} :
-                        YInfM ? {PSgn, Y[62:0]} :
-                        ZInfM ? {ZSgn, Addend[62:0]} :
+                        XInfM ? (FmtM ? {PSgn, XExpM, XFracM} : {{32{1'b1}}, PSgn, XExpM[7:0], XFracM[51:29]}) :   // X infinite: X's exp/frac with the product sign; single has upper 32 bits set to ones
+                        YInfM ? (FmtM ? {PSgn, YExpM, YFracM} : {{32{1'b1}}, PSgn, YExpM[7:0], YFracM[51:29]}) :   // Y infinite: Y's exp/frac with the product sign (condition was a duplicated XInfM)
+                        ZInfM ? (FmtM ? {ZSgnM, ZExpM, ZFracM} : {{32{1'b1}}, ZSgnM, ZExpM[7:0], ZFracM[51:29]}) : // Z infinite: pass Z through with its own sign (condition was a duplicated XInfM)
                        Overflow ? OverflowResult :
                        KillProdM ? KillProdResult : // has to be after Underflow      
                        Underflow & ~ResultDenorm ? UnderflowResult :  
                        FmtM ? {ResultSgn, ResultExp, ResultFrac} :
-                               {ResultSgn, ResultExp[7:0], ResultFrac, 3'b0};
+                               {{32{1'b1}}, ResultSgn, ResultExp[7:0], ResultFrac[51:29]};



--- a/wally-pipelined/src/fpu/fma1.sv
+++ b/wally-pipelined/src/fpu/fma1.sv
@ -1,184 +0,0 @@
-module fma1(
- 
-    input logic     [63:0]      X,  // X
-    input logic     [63:0]      Y,  // Y
-    input logic     [63:0]      Z,  // Z
-    input logic     [2:0]       FOpCtrlE,   // 000 = fmadd (X*Y)+Z,  001 = fmsub (X*Y)-Z,  010 = fnmsub -(X*Y)+Z,  011 = fnmadd -(X*Y)-Z,  100 = fmul (X*Y)
-    input logic                 FmtE,       // precision 1 = double 0 = single
-    output logic    [105:0]     ProdManE,   // 1.X frac * 1.Y frac
-    output logic    [161:0]     AlignedAddendE, // Z aligned for addition
-    output logic    [12:0]      ProdExpE,       // X exponent + Y exponent - bias
-    output logic                AddendStickyE,  // sticky bit that is calculated during alignment
-    output logic                KillProdE,      // set the product to zero before addition if the product is too small to matter
-    output logic                XZeroE, YZeroE, ZZeroE, // inputs are zero
-    output logic                XInfE, YInfE, ZInfE,    // inputs are infinity
-    output logic                XNaNE, YNaNE, ZNaNE);   // inputs are NaN
-
-    logic [51:0]    XFrac,YFrac,ZFrac;  // input fraction
-    logic [52:0]    XMan,YMan,ZMan;     // input mantissa (with leading one)
-    logic [12:0]    XExp,YExp,ZExp;     // input exponents
-    logic           XSgn,YSgn,ZSgn;     // input signs
-    logic [12:0]    AlignCnt;           // how far to shift the addend to align with the product
-    logic [213:0]   ZManShifted;                // output of the alignment shifter including sticky bit
-    logic [213:0]   ZManPreShifted;     // input to the alignment shifter
-    logic           XDenorm, YDenorm, ZDenorm;  // inputs are denormal
-    logic [63:0]    Addend; // value to add (Z or zero)
-    logic [12:0]    Bias;   // 1023 for double, 127 for single
-    logic           XExpZero, YExpZero, ZExpZero;   // input exponent zero
-    logic           XFracZero, YFracZero, ZFracZero; // input fraction zero
-    logic           XExpMax, YExpMax, ZExpMax;  // input exponent all 1s
-
-    ///////////////////////////////////////////////////////////////////////////////
-    // split inputs into the sign bit, fraction, and exponent to handle single or double precision
-    //      - single precision is in the top half of the inputs
-    ///////////////////////////////////////////////////////////////////////////////
-
-    // Set addend to zero if FMUL instruction
-    assign Addend = FOpCtrlE[2] ? 64'b0 : Z;
-
-    assign XSgn = X[63];
-    assign YSgn = Y[63];
-    assign ZSgn = Addend[63];
-
-    assign XExp = FmtE ? {2'b0, X[62:52]} : {5'b0, X[62:55]};
-    assign YExp = FmtE ? {2'b0, Y[62:52]} : {5'b0, Y[62:55]};
-    assign ZExp = FmtE ? {2'b0, Addend[62:52]} : {5'b0, Addend[62:55]};
-
-    assign XFrac = FmtE ? X[51:0] : {X[54:32], 29'b0};
-    assign YFrac = FmtE ? Y[51:0] : {Y[54:32], 29'b0};
-    assign ZFrac = FmtE ? Addend[51:0] : {Addend[54:32], 29'b0};
-   
-    assign XMan = {~XExpZero, XFrac};
-    assign YMan = {~YExpZero, YFrac};
-    assign ZMan = {~ZExpZero, ZFrac};
-
-    assign Bias = FmtE ? 13'h3ff : 13'h7f;
-
-
-
-    ///////////////////////////////////////////////////////////////////////////////
-    // determine if an input is a special value
-    ///////////////////////////////////////////////////////////////////////////////
-
-    assign XExpZero = ~|XExp;
-    assign YExpZero = ~|YExp;
-    assign ZExpZero = ~|ZExp;
-   
-    assign XFracZero = ~|XFrac;
-    assign YFracZero = ~|YFrac;
-    assign ZFracZero = ~|ZFrac;
-
-    assign XExpMax = FmtE ? &XExp[10:0] : &XExp[7:0];
-    assign YExpMax = FmtE ? &YExp[10:0] : &YExp[7:0];
-    assign ZExpMax = FmtE ? &ZExp[10:0] : &ZExp[7:0];
-   
-    assign XNaNE = XExpMax & ~XFracZero;
-    assign YNaNE = YExpMax & ~YFracZero;
-    assign ZNaNE = ZExpMax & ~ZFracZero;
-
-    assign XDenorm = XExpZero & ~XFracZero;
-    assign YDenorm = YExpZero & ~YFracZero;
-    assign ZDenorm = ZExpZero & ~ZFracZero;
-
-    assign XInfE = XExpMax & XFracZero;
-    assign YInfE = YExpMax & YFracZero;
-    assign ZInfE = ZExpMax & ZFracZero;
-
-    assign XZeroE = XExpZero & XFracZero;
-    assign YZeroE = YExpZero & YFracZero;
-    assign ZZeroE = ZExpZero & ZFracZero;
-
-
-
-
-    ///////////////////////////////////////////////////////////////////////////////
-    // Calculate the product
-    //      - When multipliying two fp numbers, add the exponents
-    //      - Subtract the bias (XExp + YExp has two biases, one from each exponent)
-    //      - Denormal numbers have an an exponent value of 1, however they are
-    //        represented with an exponent of 0. add one if there is a denormal number
-    ///////////////////////////////////////////////////////////////////////////////
-   
-    // verilator lint_off WIDTH
-    assign ProdExpE = (XZeroE|YZeroE) ? 13'b0 :
-                 XExp + YExp - Bias + XDenorm + YDenorm;
-
-    // Calculate the product's mantissa
-    //      - Add the assumed one. If the number is denormalized or zero, it does not have an assumed one.
-    assign ProdManE =  XMan * YMan;
-
-
-
-
-
-
-
-
-   
-    ///////////////////////////////////////////////////////////////////////////////
-    // Alignment shifter
-    ///////////////////////////////////////////////////////////////////////////////
-
-    // determine the shift count for alignment
-    //      - negitive means Z is larger, so shift Z left
-    //      - positive means the product is larger, so shift Z right
-    //      - Denormal numbers have an an exponent value of 1, however they are
-    //        represented with an exponent of 0. add one to the exponent if it is a denormal number
-    assign AlignCnt = ProdExpE - ZExp - ZDenorm;
-    // verilator lint_on WIDTH
-
-
-    // Defualt Addition without shifting
-    //          |   55'b0    |  106'b(product)  | 2'b0 |
-    //                       |1'b0| addnend |
-
-    // the 1'b0 before the added is because the product's mantissa has two bits before the binary point (xx.xxxxxxxxxx...)
-    assign ZManPreShifted = {55'b0, ZMan, 106'b0};
-    always_comb
-        begin
-           
-        // If the product is too small to effect the sum, kill the product
-
-        //          |   54'b0    |  106'b(product)  | 2'b0 |
-        //  | addnend |
-        if ($signed(AlignCnt) <= $signed(-13'd56)) begin
-            KillProdE = 1;
-            ZManShifted = ZManPreShifted;//{107'b0, ZMan, 54'b0};
-            AddendStickyE = ~(XZeroE|YZeroE);
-
-        // If the Addend is shifted left (negitive AlignCnt)
-
-        //          |   54'b0    |  106'b(product)  | 2'b0 |
-        //                  | addnend |
-        end else if($signed(AlignCnt) <= $signed(13'd0))  begin
-            KillProdE = 0;
-            ZManShifted = ZManPreShifted << -AlignCnt;
-            AddendStickyE = |(ZManShifted[51:0]);
-
-        // If the Addend is shifted right (positive AlignCnt)
-
-        //          |   54'b0    |  106'b(product)  | 2'b0 |
-        //                                  | addnend |
-        end else if ($signed(AlignCnt)<=$signed(13'd106))  begin
-            KillProdE = 0;
-            ZManShifted = ZManPreShifted >> AlignCnt;
-            AddendStickyE = |(ZManShifted[51:0]);
-
-        // If the addend is too small to effect the addition        
-        //      - The addend has to shift two past the end of the addend to be considered too small
-        //      - The 2 extra bits are needed for rounding
-
-        //          |   54'b0    |  106'b(product)  | 2'b0 |
-        //                                                      | addnend |
-        end else begin
-            KillProdE = 0;
-            ZManShifted = 0;
-            AddendStickyE = ~ZZeroE;
-
-        end
-    end
-
-   
-    assign AlignedAddendE = ZManShifted[213:52];
-
-endmodule
--- a/wally-pipelined/src/fpu/fpdiv.sv
+++ b/wally-pipelined/src/fpu/fpdiv.sv
@ -0,0 +1,152 @@
+//
+// File name : fpdiv
+// Title     : Floating-Point Divider/Square-Root
+// project   : FPU
+// Library   : fpdiv
+// Author(s) : James E. Stine, Jr.
+// Purpose   : definition of main unit to floating-point div/sqrt
+// notes :   
+//
+// Copyright Oklahoma State University
+//
+// Basic Operations
+//
+// Step 1: Load operands, set flags, and convert SP to DP
+// Step 2: Check for special inputs ( +/- Infinity,  NaN)
+// Step 3: Exponent Logic
+// Step 4: Divide/Sqrt using Goldschmidt
+// Step 5: Normalize the result.
+//   Shift left until normalized.  Normalized when the value to the 
+//   left of the binary point is 1.
+// Step 6: Round the result.
+// Step 7: Put quotient/remainder onto output.
+//
+
+// `timescale 1ps/1ps
+module fpdiv (AS_Result, Flags, Denorm, op1, op2, rm, op_type, P, OvEn, UnEn,
+	      start, reset, clk);
+
+   input [63:0] op1;		// 1st input operand (A)
+   input [63:0] op2;		// 2nd input operand (B)
+   input [1:0] 	rm;		// Rounding mode (forwarded to rounder_div)
+   input 	op_type;	// Function opcode (1 selects the sqrt exponent, 0 the divide exponent)
+   input 	P;   		// Result Precision (0 for double, 1 for single)
+   input 	OvEn;		// Overflow trap enabled
+   input 	UnEn;   	// Underflow trap enabled
+   input 	start;		// forwarded to the fsm_div controller
+   input 	reset;
+   input 	clk;   
+
+   output [63:0] AS_Result;	// Result of operation
+   output [4:0]  Flags;   	// IEEE exception flags
+   output 	 Denorm;   	// Denorm on input or output
+   logic 	 done;		// driven by fsm_div; third argument of the output flopenr instances (presumably the enable)
+   // output 	 done;
+
+   supply1 	  vdd;
+   supply0 	  vss;   
+
+   wire [63:0] 	 Float1; 
+   wire [63:0] 	 Float2;
+   wire [63:0] 	 IntValue;
+   
+   wire [12:0] 	 exp1, exp2, expF;
+   wire [12:0] 	 exp_diff, bias;
+   wire [13:0] 	 exp_sqrt;
+   wire [12:0] 	 exp_s;
+   wire [12:0] 	 exp_c;
+   
+   wire [10:0] 	 exponent, exp_pre;
+   wire [63:0] 	 Result;   
+   wire [52:0] 	 mantissaA;
+   wire [52:0] 	 mantissaB; 
+   wire [63:0] 	 sum, sum_tc, sum_corr, sum_norm;
+   
+   wire [5:0] 	 align_shift;
+   wire [5:0] 	 norm_shift;
+   wire [2:0] 	 sel_inv;
+   wire		 op1_Norm, op2_Norm;
+   wire		 opA_Norm, opB_Norm;
+   wire		 Invalid;
+   wire 	 DenormIn, DenormIO;
+   wire [4:0] 	 FlagsIn;   	
+   wire 	 exp_gt63;
+   wire 	 Sticky_out;
+   wire 	 signResult, sign_corr;
+   wire          corr_sign;
+   wire 	 zeroB;         
+   wire 	 convert;
+   wire          swap;
+   wire          sub;
+   
+   wire [63:0] 	 q1, qm1, qp1, q0, qm0, qp0;
+   wire [63:0] 	 rega_out, regb_out, regc_out, regd_out;
+   wire [127:0]  regr_out;
+   wire [2:0] 	 sel_muxa, sel_muxb;
+   wire 	 sel_muxr;   
+   wire 	 load_rega, load_regb, load_regc, load_regd, load_regr;
+
+   wire 	 donev, sel_muxrv, sel_muxsv;
+   wire [1:0] 	 sel_muxav, sel_muxbv;   
+   wire 	 load_regav, load_regbv, load_regcv;
+   wire 	 load_regrv, load_regsv;
+   
+   // Convert the input operands to their appropriate forms based on 
+   // the original operands, the op_type, and their precision P.
+   // Single precision inputs are converted to double precision 
+   // and the sign of the first operand is set appropriately based on
+   // if the operation is absolute value or negation.   
+   convert_inputs_div conv1 (Float1, Float2, op1, op2, op_type, P);
+
+   // Test for exceptions and return the "Invalid Operation" and
+   // "Denormalized" Input Flags. The "sel_inv" is used in
+   // the third pipeline stage to select the result. Also, op1_Norm
+   // and op2_Norm are one if op1 and op2 are not zero or denormalized.
+   // sub is one if the effective operation is subtraction.
+   // NOTE(review): sub is declared above but is not connected to exception_div here.
+   exception_div exc1 (sel_inv, Invalid, DenormIn, op1_Norm, op2_Norm, 
+		       Float1, Float2, op_type);
+
+   // Determine Sign/Mantissa
+   assign signResult = ((Float1[63]^Float2[63])&~op_type) | Float1[63]&op_type;
+   assign mantissaA = {vdd, Float1[51:0]};
+   assign mantissaB = {vdd, Float2[51:0]};
+   // Perform Exponent Subtraction - expA - expB + Bias   
+   assign exp1 = {2'b0, Float1[62:52]};
+   assign exp2 = {2'b0, Float2[62:52]};
+   // bias : DP = 2^{11-1}-1 = 1023
+   assign bias = {3'h0, 10'h3FF};
+   // Divide exponent
+   csa #(13) csa1 (exp1, ~exp2, bias, exp_s, exp_c);
+   adder #(14) explogic1 ({vss, exp_s}, {vss, exp_c}, 1'b1, {open, exp_diff}, exp_cout1); // NOTE(review): open and exp_cout1 are not declared in this module (implicit nets)
+   
+   // Sqrt exponent (check if exponent is odd)
+   assign exp_odd = Float1[52] ? vss : vdd; // NOTE(review): exp_odd is not declared in this module (implicit net)
+   adder #(14) explogic2 ({vss, exp1}, {4'h0, 10'h3ff}, exp_odd, exp_sqrt, exp_cout2); // NOTE(review): exp_cout2 is not declared in this module (implicit net)
+   // Choose correct exponent
+   assign expF = op_type ? exp_sqrt[13:1] : exp_diff;   
+
+   // Main Goldschmidt/Division Routine   
+   // NOTE(review): load_regs is not declared in this module (only load_regsv is); it is an implicit net
+   divconv goldy (q1, qm1, qp1, q0, qm0, qp0, rega_out, regb_out, regc_out, regd_out,
+		  regr_out, mantissaB, mantissaA, sel_muxa, sel_muxb, sel_muxr, 
+		  reset, clk,  load_rega, load_regb, load_regc, load_regd,
+		  load_regr, load_regs, P, op_type, exp_odd);
+
+   // FSM : control divider   
+   // NOTE(review): error is not declared in this module and is not used elsewhere in it
+   fsm_div control (done, load_rega, load_regb, load_regc, load_regd, 
+		    load_regr, load_regs, sel_muxa, sel_muxb, sel_muxr, 
+		    clk, reset, start, error, op_type);
+   
+   // Round the mantissa to a 52-bit value, with the leading one
+   // removed. The rounding unit also handles special cases and
+   // sets the exception flags.
+   rounder_div round1 (Result, DenormIO, FlagsIn, 
+		   rm, P, OvEn, UnEn, expF, 
+   		   sel_inv, Invalid, DenormIn, signResult, 
+		   q1, qm1, qp1, q0, qm0, qp0, regr_out);
+
+   // Store the final result and the exception flags in registers.
+   flopenr #(64) rega (clk, reset, done, Result, AS_Result);
+   flopenr #(1) regb (clk, reset, done, DenormIO, Denorm);   
+   flopenr #(5) regc (clk, reset, done, FlagsIn, Flags);   
+   
+endmodule // fpdiv
--- a/wally-pipelined/src/fpu/fpdivsqrt/adder_ip.sv
+++ b/wally-pipelined/src/fpu/fpdivsqrt/adder_ip.sv
@ -1,9 +0,0 @@
-module adder_ip #(parameter WIDTH=8)
-   (input  logic [WIDTH-1:0] a, b,
-    input logic 	     cin,
-    output logic [WIDTH-1:0] y,
-    output logic 	     cout);
-   
-   assign {cout, y} = a + b + cin;
-   
-endmodule // adder
--- a/wally-pipelined/src/fpu/fpdivsqrt/convert_inputs_div.sv
+++ b/wally-pipelined/src/fpu/fpdivsqrt/convert_inputs_div.sv
@ -3,8 +3,7 @@
 // it conditionally converts single precision values to double 
 // precision values and modifies the sign of op1. 
 // The converted operands are Float1 and Float2.
-
-module convert_inputs(Float1, Float2b, op1, op2, op_type, P);
+module convert_inputs_div (Float1, Float2b, op1, op2, op_type, P);
   
   input logic [63:0]  op1;           // 1st input operand (A)
   input logic [63:0]  op2;           // 2nd input operand (B)
--- a/wally-pipelined/src/fpu/fpdivsqrt/divconvDP.sv
+++ b/wally-pipelined/src/fpu/fpdivsqrt/divconvDP.sv
@ -1,19 +1,13 @@
 `timescale 1ps/1ps
-module divconv (q1, qm1, qp1, q0, qm0, qp0, 
-		rega_out, regb_out, regc_out, regd_out,
-		regr_out, d, n, 
-		sel_muxa, sel_muxb, sel_muxr, 
-		reset, clk,
-		load_rega, load_regb, load_regc, load_regd,
-		load_regr, load_regs, load_regp,
-		P, op_type, exp_odd);
+module divconv (q1, qm1, qp1, q0, qm0, qp0, rega_out, regb_out, regc_out, regd_out,
+		regr_out, d, n, sel_muxa, sel_muxb, sel_muxr, reset, clk, load_rega, load_regb, 
+		load_regc, load_regd, load_regr, load_regs, P, op_type, exp_odd);

   input logic [52:0]   d, n;
   input logic [2:0] 	sel_muxa, sel_muxb;
   input logic 	        sel_muxr;   
   input logic 	        load_rega, load_regb, load_regc, load_regd;
   input logic 		load_regr, load_regs;
-   input logic 		load_regp;   
   input logic 		P;
   input logic 		op_type;
   input logic 		exp_odd;   
@ -78,86 +72,47 @@ module divconv (q1, qm1, qp1, q0, qm0, qp0,
   mux2 #(64) mx8 ({64'h0000_0000_0000_0200}, {64'h0000_0040_0000_0000}, P, q_const);
   mux2 #(64) mx9 ({64'h0000_0000_0000_0A00}, {64'h0000_0140_0000_0000}, P, qp_const);
   mux2 #(64) mxA ({64'hFFFF_FFFF_FFFF_F9FF}, {64'hFFFF_FF3F_FFFF_FFFF}, P, qm_const);
-
-   logic [127:0] 	Sum_pipe;
-   logic [127:0] 	Carry_pipe;
-   logic 		muxr_pipe;   
-   logic 		rega_pipe;
-   logic 		regb_pipe;
-   logic 		regc_pipe;
-   logic 		regd_pipe;
-   logic 		regs_pipe;
-   logic 		regr_pipe;
-   logic 		P_pipe;
-   logic 		op_type_pipe;
-   logic [63:0] 	q_const_pipe;
-   logic [63:0] 	qm_const_pipe;
-   logic [63:0] 	qp_const_pipe;   
   
-   // Pipeline Stage 2 of iteration for Goldschmidt's algorithm
-   flopenr #(128) regp1 (clk, reset, load_regp, Sum2, Sum_pipe);
-   flopenr #(128) regp2 (clk, reset, load_regp, Carry2, Carry_pipe);
-   flopenr #(1) regp3 (clk, reset, load_regp, muxr_out, muxr_pipe);
-
-   flopenr #(1) regp4 (clk, reset, load_regp, load_rega, rega_pipe);
-   flopenr #(1) regp5 (clk, reset, load_regp, load_regb, regb_pipe);
-   flopenr #(1) regp6 (clk, reset, load_regp, load_regc, regc_pipe);
-   flopenr #(1) regp7 (clk, reset, load_regp, load_regd, regd_pipe);
-   flopenr #(1) regp8 (clk, reset, load_regp, load_regs, regs_pipe);
-   flopenr #(1) regp9 (clk, reset, load_regp, load_regr, regr_pipe);
-   flopenr #(1) regpA (clk, reset, load_regp, P, P_pipe);
-   flopenr #(1) regpB (clk, reset, load_regp, op_type, op_type_pipe);
-   flopenr #(64) regpC (clk, reset, load_regp, q_const, q_const_pipe);
-   flopenr #(64) regpD (clk, reset, load_regp, qp_const, qp_const_pipe);
-   flopenr #(64) regpE (clk, reset, load_regp, qm_const, qm_const_pipe);
-
   // CPA (from CSA)/Remainder addition/subtraction
-   adder_ip #(128) cpa1 (Sum_pipe, Carry_pipe, muxr_pipe, mul_out, cout1);   
-   // ldf128 cpa1 (cout1, mul_out, Sum_pipe, Carry_pipe, muxr_pipe);
-   // One's complement instead of two's complement (for hw efficiency)
-   assign three = {~mul_out[126] , mul_out[126], ~mul_out[125:63]};   
-   mux2 #(64) mxTC (~mul_out[126:63], three[64:1],  op_type_pipe, twocmp_out);
+   adder #(128) cpa1 (Sum2, Carry2, muxr_out, mul_out, cout1);   
   
   // Assuming [1,2) - q1
-   adder_ip #(64) cpa2 (regb_out, q_const_pipe, 1'b0, q_out1, cout2);
-   adder_ip #(64) cpa3 (regb_out, qp_const_pipe, 1'b0, qp_out1, cout3);
-   adder_ip #(64) cpa4 (regb_out, qm_const_pipe, 1'b1, qm_out1, cout4);
-   adder_ip #(64) cpa5 ({regb_out[62:0], vss}, q_const_pipe, 1'b0, q_out0, cout5);
-   adder_ip #(64) cpa6 ({regb_out[62:0], vss}, qp_const_pipe, 1'b0, qp_out0, cout6);
-   adder_ip #(64) cpa7 ({regb_out[62:0], vss}, qm_const_pipe, 1'b1, qm_out0, cout7);      
-  
-   //ldf64 cpa2 (cout2, q_out1, regb_out, q_const_pipe, 1'b0);
-   //ldf64 cpa3 (cout3, qp_out1, regb_out, qp_const_pipe, 1'b0);
-   //ldf64 cpa4 (cout4, qm_out1, regb_out, qm_const_pipe, 1'b1);   
-   // Assuming [0.5,1) - q0
-   //ldf64 cpa5 (cout5, q_out0, {regb_out[62:0], vss}, q_const_pipe, 1'b0);
-   //ldf64 cpa6 (cout6, qp_out0, {regb_out[62:0], vss}, qp_const_pipe, 1'b0);
-   //ldf64 cpa7 (cout7, qm_out0, {regb_out[62:0], vss}, qm_const_pipe, 1'b1);
+   adder #(64) cpa2 (regb_out, q_const, 1'b0, q_out1, cout2);
+   adder #(64) cpa3 (regb_out, qp_const, 1'b0, qp_out1, cout3);
+   adder #(64) cpa4 (regb_out, qm_const, 1'b1, qm_out1, cout4);
+   // Assuming [0.5,1) - q0   
+   adder #(64) cpa5 ({regb_out[62:0], vss}, q_const, 1'b0, q_out0, cout5);
+   adder #(64) cpa6 ({regb_out[62:0], vss}, qp_const, 1'b0, qp_out0, cout6);
+   adder #(64) cpa7 ({regb_out[62:0], vss}, qm_const, 1'b1, qm_out0, cout7);    
+
+   // One's complement instead of two's complement (for hw efficiency)
+   assign three = {~mul_out[126], mul_out[126], ~mul_out[125:63]};   
+   mux2 #(64) mxTC (~mul_out[126:63], three[64:1],  op_type, twocmp_out);

   // regs
-   flopenr #(64) regc (clk, reset, regc_pipe, twocmp_out, regc_out);
-   flopenr #(64) regb (clk, reset, regb_pipe, mul_out[126:63], regb_out);
-   flopenr #(64) rega (clk, reset, rega_pipe, mul_out[126:63], rega_out);
-   flopenr #(64) regd (clk, reset, regd_pipe, mul_out[126:63], regd_out);
-
-   // remainder
-   flopenr #(128) regr (clk, reset, regr_pipe, mul_out, regr_out);
+   flopenr #(64) regc (clk, reset, load_regc, twocmp_out, regc_out);
+   flopenr #(64) regb (clk, reset, load_regb, mul_out[126:63], regb_out);
+   flopenr #(64) rega (clk, reset, load_rega, mul_out[126:63], rega_out);
+   flopenr #(64) regd (clk, reset, load_regd, mul_out[126:63], regd_out);
+   flopenr #(128) regr (clk, reset, load_regr, mul_out, regr_out);
   // Assuming [1,2)
-   flopenr #(64) rege (clk, reset, regs_pipe, {q_out1[63:39], (q_out1[38:10] & {29{~P_pipe}}), 10'h0}, q1);   
-   flopenr #(64) regf (clk, reset, regs_pipe, {qm_out1[63:39], (qm_out1[38:10] & {29{~P_pipe}}), 10'h0}, qm1);
-   flopenr #(64) regg (clk, reset, regs_pipe, {qp_out1[63:39], (qp_out1[38:10] & {29{~P_pipe}}), 10'h0}, qp1);
+   flopenr #(64) rege (clk, reset, load_regs, {q_out1[63:39], (q_out1[38:10] & {29{~P}}), 10'h0}, q1);   
+   flopenr #(64) regf (clk, reset, load_regs, {qm_out1[63:39], (qm_out1[38:10] & {29{~P}}), 10'h0}, qm1);
+   flopenr #(64) regg (clk, reset, load_regs, {qp_out1[63:39], (qp_out1[38:10] & {29{~P}}), 10'h0}, qp1);
   // Assuming [0,1)
-   flopenr #(64) regh (clk, reset, regs_pipe, {q_out0[63:39], (q_out0[38:10] & {29{~P_pipe}}), 10'h0}, q0);
-   flopenr #(64) regj (clk, reset, regs_pipe, {qm_out0[63:39], (qm_out0[38:10] & {29{~P_pipe}}), 10'h0}, qm0);
-   flopenr #(64) regk (clk, reset, regs_pipe, {qp_out0[63:39], (qp_out0[38:10] & {29{~P_pipe}}), 10'h0}, qp0);
+   flopenr #(64) regh (clk, reset, load_regs, {q_out0[63:39], (q_out0[38:10] & {29{~P}}), 10'h0}, q0);
+   flopenr #(64) regj (clk, reset, load_regs, {qm_out0[63:39], (qm_out0[38:10] & {29{~P}}), 10'h0}, qm0);
+   flopenr #(64) regk (clk, reset, load_regs, {qp_out0[63:39], (qp_out0[38:10] & {29{~P}}), 10'h0}, qp0);
   
 endmodule // divconv

 module adder #(parameter WIDTH=8)
   (input  logic [WIDTH-1:0] a, b,
-    output logic [WIDTH-1:0] y);
+    input logic 	     cin,   // carry-in, added into the sum
+    output logic [WIDTH-1:0] y,     // low WIDTH bits of a + b + cin
+    output logic 	     cout); // bit above y in the sum (carry-out)
   
-   assign y = a + b;
+   assign {cout, y} = a + b + cin; // the (WIDTH+1)-bit left-hand side keeps the carry out of the top bit
   
 endmodule // adder

--- a/Show More
+++ b/Show More
				`@ -0,0 +1 @@`
				`This file only exists so that git will create ./.`